diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/analytics/coremldata.bin b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/analytics/coremldata.bin
index 823d91cf23a2bb454a5b6dd99a9ae05da88006bc..e336d5a767baed9689302dba203620df3bd2beb7 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9880c37e22316efbaf285d84d1a4eecab31657806256346bd1939fcf4a775924
+oid sha256:acd29bd7bc0274452e77f5735e705d136916e5bd978ca14d6a383c7a2018618a
 size 243
diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/coremldata.bin b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/coremldata.bin
index 6eed47e0dd8f2c62579405217a9755f463661f78..56d90a939a4077bbce418ac68249e3597338a6c8 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/coremldata.bin
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:914dd099519cff6e07d71c8a29081c50c800030c21d635e1a226ec7286819c8d
-size 1292
+oid sha256:2c45f751d1fdb81520abae505c4a969a6b817d816487ce6e901e0e02e60a5f0b
+size 1395
diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/metadata.json b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/metadata.json
index 3dc3df54de7517ae88aad148997bb18d28dc5797..c4e7d5f6666d406a128108a844274d48bb01a2ae 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/metadata.json
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=1, max_speakers=4)",
+    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=1, max_speakers=4, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,12 +81,12 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 66,
+      "Ios17.reshape" : 67,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
       "Split" : 4,
-      "Ios17.expandDims" : 3,
+      "Ios17.expandDims" : 4,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
       "Ios17.sliceByIndex" : 36,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 1 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 15 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 1, 345]",
+        "shape" : "[1, 15, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 15}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/model.mil b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/model.mil
index 69b8fb57d7fa52c2c479b44e6c51ee2bf32cccf4..2ac962d651b9b73a7bea06a0e9008c399748989e 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/model.mil
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlmodelc/model.mil
@@ -1,233 +1,239 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 1, 345]> features, tensor<fp32, [1]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [1, 1]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
-            tensor<fp32, [1]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 1, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 15, 23]> features, tensor<fp32, [1]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [1, 1]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
+            tensor<fp32, [1]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [1]> stacked_axes_0 = const()[name = tensor<string, []>("stacked_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1, 1, 15, 23]> stacked = expand_dims(axes = stacked_axes_0, x = features)[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, [3]>([1, 1, 345])];
+            tensor<fp32, [1, 1, 345]> input_1 = reshape(shape = var_26, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_38 = const()[name = tensor<string, []>("op_38"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_40 = const()[name = tensor<string, []>("op_40"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_43 = const()[name = tensor<string, []>("op_43"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_48 = const()[name = tensor<string, []>("op_48"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 1, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 1, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 1, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 1, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_35, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 1, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 1, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_173 = const()[name = tensor<string, []>("op_173"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_174 = mul(x = input_13, y = var_173)[name = tensor<string, []>("op_174")];
+            tensor<fp32, [1, 1, 256]> input_15 = add(x = var_174, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_35, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -238,139 +244,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 1, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 1, 256]> var_188 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_190 = reshape(shape = var_189, x = var_188)[name = tensor<string, []>("op_190")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 1, 256]> var_194 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_196 = mul(x = var_194, y = var_195)[name = tensor<string, []>("op_196")];
+            tensor<int32, [4]> var_197 = const()[name = tensor<string, []>("op_197"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_198 = reshape(shape = var_197, x = var_196)[name = tensor<string, []>("op_198")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 1, 256]> var_202 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_204 = reshape(shape = var_203, x = var_202)[name = tensor<string, []>("op_204")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 1, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [1]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_198)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_190)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 1, 1]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [1, 1]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 1, 1]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_215 = reshape(shape = var_214, x = sqrt_s_t_1)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 1]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_215)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 1, 1]> var_217 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_217")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [1]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_204)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_217, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_219_transpose_x_0 = const()[name = tensor<string, []>("op_219_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_219_transpose_y_0 = const()[name = tensor<string, []>("op_219_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_219 = matmul(transpose_x = var_219_transpose_x_0, transpose_y = var_219_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_219")];
+            tensor<fp32, [1]> var_220 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_220")];
+            tensor<int32, [4]> var_221 = const()[name = tensor<string, []>("op_221"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_222 = reshape(shape = var_221, x = var_220)[name = tensor<string, []>("op_222")];
+            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_219, y = var_222)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 1, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_225 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_225")];
+            tensor<bool, []> var_227_transpose_x_1 = const()[name = tensor<string, []>("op_227_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_227_transpose_y_1 = const()[name = tensor<string, []>("op_227_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_227 = matmul(transpose_x = var_227_transpose_x_1, transpose_y = var_227_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_227")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_225, y = var_227)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_229)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_231 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_231")];
+            tensor<fp32, [1, 4, 64, 64]> var_232 = real_div(x = new_kv_unnorm_1, y = var_231)[name = tensor<string, []>("op_232")];
+            tensor<int32, [4]> var_233_perm_0 = const()[name = tensor<string, []>("op_233_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 1, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 1, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 1, 4, 64]> var_233 = transpose(perm = var_233_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_43, x = var_233)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_237, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 1, 256]> var_239 = silu(x = input_19)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [1, 1, 256]> input_21 = mul(x = var_239, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_250_begin_0 = const()[name = tensor<string, []>("op_250_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_250_end_0 = const()[name = tensor<string, []>("op_250_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_250_end_mask_0 = const()[name = tensor<string, []>("op_250_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_250 = slice_by_index(begin = var_250_begin_0, end = var_250_end_0, end_mask = var_250_end_mask_0, x = window_1)[name = tensor<string, []>("op_250")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, x_3))[name = tensor<string, []>("window_3")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = window_3)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_52, interleave = window_3_interleave_0, values = (var_250, x_3))[name = tensor<string, []>("window_3")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_23 = concat(axis = var_38, interleave = input_23_interleave_0, values = window_3)[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_249_split_sizes_0 = const()[name = tensor<string, []>("op_249_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_249_axis_0 = const()[name = tensor<string, []>("op_249_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_249_0, tensor<fp32, [1, 256, 16]> var_249_1 = split(axis = var_249_axis_0, split_sizes = var_249_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_249")];
-            tensor<fp32, [1, 256, 16]> var_251 = sigmoid(x = var_249_1)[name = tensor<string, []>("op_251")];
-            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_249_0, y = var_251)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [1, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_275_split_sizes_0 = const()[name = tensor<string, []>("op_275_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_275_axis_0 = const()[name = tensor<string, []>("op_275_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_275_0, tensor<fp32, [1, 256, 16]> var_275_1 = split(axis = var_275_axis_0, split_sizes = var_275_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_275")];
+            tensor<fp32, [1, 256, 16]> var_277 = sigmoid(x = var_275_1)[name = tensor<string, []>("op_277")];
+            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_275_0, y = var_277)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [1, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_35, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [1, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_282_begin_0 = const()[name = tensor<string, []>("op_282_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_282_end_0 = const()[name = tensor<string, []>("op_282_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_282_end_mask_0 = const()[name = tensor<string, []>("op_282_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [1, 1, 256]> var_282 = slice_by_index(begin = var_282_begin_0, end = var_282_end_0, end_mask = var_282_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_282")];
-            tensor<int32, [3]> var_284_perm_0 = const()[name = tensor<string, []>("op_284_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_284 = transpose(perm = var_284_perm_0, x = var_282)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 1, 256]> input_31 = add(x = x_3, y = var_284)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 1, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 1, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_307 = const()[name = tensor<string, []>("op_307"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_308 = mul(x = input_39, y = var_307)[name = tensor<string, []>("op_308")];
-            tensor<fp32, [1, 1, 256]> input_41 = add(x = var_308, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_308_begin_0 = const()[name = tensor<string, []>("op_308_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_308_end_0 = const()[name = tensor<string, []>("op_308_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_308_end_mask_0 = const()[name = tensor<string, []>("op_308_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [1, 1, 256]> var_308 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_308")];
+            tensor<int32, [3]> var_310_perm_0 = const()[name = tensor<string, []>("op_310_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_310 = transpose(perm = var_310_perm_0, x = var_308)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 1, 256]> input_33 = add(x = x_3, y = var_310)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 1, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 1, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_333 = const()[name = tensor<string, []>("op_333"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_334 = mul(x = input_41, y = var_333)[name = tensor<string, []>("op_334")];
+            tensor<fp32, [1, 1, 256]> input_43 = add(x = var_334, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 1, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 1, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_337 = const()[name = tensor<string, []>("op_337"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_338 = mul(x = input_51, y = var_337)[name = tensor<string, []>("op_338")];
-            tensor<fp32, [1, 1, 256]> input_53 = add(x = var_338, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_35, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 1, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 1, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_363 = const()[name = tensor<string, []>("op_363"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_364 = mul(x = input_53, y = var_363)[name = tensor<string, []>("op_364")];
+            tensor<fp32, [1, 1, 256]> input_55 = add(x = var_364, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_35, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -381,139 +387,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 1, 256]> var_352 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_354 = reshape(shape = var_353, x = var_352)[name = tensor<string, []>("op_354")];
+            tensor<fp32, [1, 1, 256]> var_378 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_379 = const()[name = tensor<string, []>("op_379"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_380 = reshape(shape = var_379, x = var_378)[name = tensor<string, []>("op_380")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_358 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_359 = const()[name = tensor<string, []>("op_359"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_360 = mul(x = var_358, y = var_359)[name = tensor<string, []>("op_360")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 1, 256]> var_384 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_386 = mul(x = var_384, y = var_385)[name = tensor<string, []>("op_386")];
+            tensor<int32, [4]> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_388 = reshape(shape = var_387, x = var_386)[name = tensor<string, []>("op_388")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_366 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_368 = reshape(shape = var_367, x = var_366)[name = tensor<string, []>("op_368")];
+            tensor<fp32, [1, 1, 256]> var_392 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 1, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [1]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_354)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_388)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_380)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 1, 1]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_378 = const()[name = tensor<string, []>("op_378"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_379 = reshape(shape = var_378, x = sqrt_s_t_3)[name = tensor<string, []>("op_379")];
-            tensor<fp32, [1, 1]> M_3 = real_div(x = encoder__causal_mask, y = var_379)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 1, 1]> var_381 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_381")];
+            tensor<int32, [2]> var_404 = const()[name = tensor<string, []>("op_404"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_405 = reshape(shape = var_404, x = sqrt_s_t_3)[name = tensor<string, []>("op_405")];
+            tensor<fp32, [1, 1]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_405)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 1, 1]> var_407 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_407")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_368)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_381, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_383_transpose_x_0 = const()[name = tensor<string, []>("op_383_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_383_transpose_y_0 = const()[name = tensor<string, []>("op_383_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_383 = matmul(transpose_x = var_383_transpose_x_0, transpose_y = var_383_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_383")];
-            tensor<fp32, [1]> var_384 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
-            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_383, y = var_386)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_394)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_407, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_409_transpose_x_0 = const()[name = tensor<string, []>("op_409_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_409_transpose_y_0 = const()[name = tensor<string, []>("op_409_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_409 = matmul(transpose_x = var_409_transpose_x_0, transpose_y = var_409_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_409")];
+            tensor<fp32, [1]> var_410 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_410")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
+            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_409, y = var_412)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 1, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_389 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_389")];
-            tensor<bool, []> var_391_transpose_x_1 = const()[name = tensor<string, []>("op_391_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_391_transpose_y_1 = const()[name = tensor<string, []>("op_391_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_1, transpose_y = var_391_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_389, y = var_391)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_393)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_395 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [1, 4, 64, 64]> var_396 = real_div(x = new_kv_unnorm_3, y = var_395)[name = tensor<string, []>("op_396")];
-            tensor<int32, [4]> var_397_perm_0 = const()[name = tensor<string, []>("op_397_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_415 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_415")];
+            tensor<bool, []> var_417_transpose_x_1 = const()[name = tensor<string, []>("op_417_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_417_transpose_y_1 = const()[name = tensor<string, []>("op_417_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_417 = matmul(transpose_x = var_417_transpose_x_1, transpose_y = var_417_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_417")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_415, y = var_417)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_419 = const()[name = tensor<string, []>("op_419"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_419)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_421 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_421")];
+            tensor<fp32, [1, 4, 64, 64]> var_422 = real_div(x = new_kv_unnorm_3, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423_perm_0 = const()[name = tensor<string, []>("op_423_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_397 = transpose(perm = var_397_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_397)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_401, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 1, 256]> var_403 = silu(x = input_57)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 1, 256]> input_59 = mul(x = var_403, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 1, 4, 64]> var_423 = transpose(perm = var_423_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_43, x = var_423)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_427 = const()[name = tensor<string, []>("op_427"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_427, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 1, 256]> var_429 = silu(x = input_59)[name = tensor<string, []>("op_429")];
+            tensor<fp32, [1, 1, 256]> input_61 = mul(x = var_429, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_5_begin_0 = const()[name = tensor<string, []>("window_5_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_5_end_0 = const()[name = tensor<string, []>("window_5_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_5_end_mask_0 = const()[name = tensor<string, []>("window_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_5_squeeze_mask_0 = const()[name = tensor<string, []>("window_5_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_5 = slice_by_index(begin = window_5_begin_0, end = window_5_end_0, end_mask = window_5_end_mask_0, squeeze_mask = window_5_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_414_begin_0 = const()[name = tensor<string, []>("op_414_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_414_end_0 = const()[name = tensor<string, []>("op_414_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_414_end_mask_0 = const()[name = tensor<string, []>("op_414_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_414 = slice_by_index(begin = var_414_begin_0, end = var_414_end_0, end_mask = var_414_end_mask_0, x = window_5)[name = tensor<string, []>("op_414")];
+            tensor<int32, [3]> var_440_begin_0 = const()[name = tensor<string, []>("op_440_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_440_end_0 = const()[name = tensor<string, []>("op_440_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_440_end_mask_0 = const()[name = tensor<string, []>("op_440_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_440 = slice_by_index(begin = var_440_begin_0, end = var_440_end_0, end_mask = var_440_end_mask_0, x = window_5)[name = tensor<string, []>("op_440")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_414, x_9))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = window_7)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_52, interleave = window_7_interleave_0, values = (var_440, x_9))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_63 = concat(axis = var_38, interleave = input_63_interleave_0, values = window_7)[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_439_split_sizes_0 = const()[name = tensor<string, []>("op_439_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_439_axis_0 = const()[name = tensor<string, []>("op_439_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_439_0, tensor<fp32, [1, 256, 16]> var_439_1 = split(axis = var_439_axis_0, split_sizes = var_439_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_439")];
-            tensor<fp32, [1, 256, 16]> var_441 = sigmoid(x = var_439_1)[name = tensor<string, []>("op_441")];
-            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_439_0, y = var_441)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [1, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_465_split_sizes_0 = const()[name = tensor<string, []>("op_465_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_465_axis_0 = const()[name = tensor<string, []>("op_465_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_465_0, tensor<fp32, [1, 256, 16]> var_465_1 = split(axis = var_465_axis_0, split_sizes = var_465_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 256, 16]> var_467 = sigmoid(x = var_465_1)[name = tensor<string, []>("op_467")];
+            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_465_0, y = var_467)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [1, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_35, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [1, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_472_begin_0 = const()[name = tensor<string, []>("op_472_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_472_end_0 = const()[name = tensor<string, []>("op_472_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_472_end_mask_0 = const()[name = tensor<string, []>("op_472_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [1, 1, 256]> var_472 = slice_by_index(begin = var_472_begin_0, end = var_472_end_0, end_mask = var_472_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_472")];
-            tensor<int32, [3]> var_474_perm_0 = const()[name = tensor<string, []>("op_474_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_474 = transpose(perm = var_474_perm_0, x = var_472)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 1, 256]> input_71 = add(x = x_9, y = var_474)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 1, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 1, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_497 = const()[name = tensor<string, []>("op_497"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_498 = mul(x = input_79, y = var_497)[name = tensor<string, []>("op_498")];
-            tensor<fp32, [1, 1, 256]> input_81 = add(x = var_498, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_498_begin_0 = const()[name = tensor<string, []>("op_498_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_498_end_0 = const()[name = tensor<string, []>("op_498_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_498_end_mask_0 = const()[name = tensor<string, []>("op_498_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [1, 1, 256]> var_498 = slice_by_index(begin = var_498_begin_0, end = var_498_end_0, end_mask = var_498_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_498")];
+            tensor<int32, [3]> var_500_perm_0 = const()[name = tensor<string, []>("op_500_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_500 = transpose(perm = var_500_perm_0, x = var_498)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 1, 256]> input_73 = add(x = x_9, y = var_500)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 1, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 1, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_523 = const()[name = tensor<string, []>("op_523"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_524 = mul(x = input_81, y = var_523)[name = tensor<string, []>("op_524")];
+            tensor<fp32, [1, 1, 256]> input_83 = add(x = var_524, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 1, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 1, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_527 = const()[name = tensor<string, []>("op_527"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_528 = mul(x = input_91, y = var_527)[name = tensor<string, []>("op_528")];
-            tensor<fp32, [1, 1, 256]> input_93 = add(x = var_528, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_35, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 1, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 1, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_553 = const()[name = tensor<string, []>("op_553"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_554 = mul(x = input_93, y = var_553)[name = tensor<string, []>("op_554")];
+            tensor<fp32, [1, 1, 256]> input_95 = add(x = var_554, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_35, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -524,139 +530,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 1, 256]> var_542 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_544 = reshape(shape = var_543, x = var_542)[name = tensor<string, []>("op_544")];
+            tensor<fp32, [1, 1, 256]> var_568 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_569 = const()[name = tensor<string, []>("op_569"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_570 = reshape(shape = var_569, x = var_568)[name = tensor<string, []>("op_570")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_548 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_550 = mul(x = var_548, y = var_549)[name = tensor<string, []>("op_550")];
-            tensor<int32, [4]> var_551 = const()[name = tensor<string, []>("op_551"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_552 = reshape(shape = var_551, x = var_550)[name = tensor<string, []>("op_552")];
+            tensor<fp32, [1, 1, 256]> var_574 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_576 = mul(x = var_574, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<int32, [4]> var_577 = const()[name = tensor<string, []>("op_577"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_578 = reshape(shape = var_577, x = var_576)[name = tensor<string, []>("op_578")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_556 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_557 = const()[name = tensor<string, []>("op_557"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_558 = reshape(shape = var_557, x = var_556)[name = tensor<string, []>("op_558")];
+            tensor<fp32, [1, 1, 256]> var_582 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 1, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [1]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_552)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_544)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_578)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_570)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 1, 1]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_568 = const()[name = tensor<string, []>("op_568"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_569 = reshape(shape = var_568, x = sqrt_s_t_5)[name = tensor<string, []>("op_569")];
-            tensor<fp32, [1, 1]> M_5 = real_div(x = encoder__causal_mask, y = var_569)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 1, 1]> var_571 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_571")];
+            tensor<int32, [2]> var_594 = const()[name = tensor<string, []>("op_594"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_595 = reshape(shape = var_594, x = sqrt_s_t_5)[name = tensor<string, []>("op_595")];
+            tensor<fp32, [1, 1]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_595)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 1, 1]> var_597 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_597")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_558)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_571, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_573_transpose_x_0 = const()[name = tensor<string, []>("op_573_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_573_transpose_y_0 = const()[name = tensor<string, []>("op_573_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_573 = matmul(transpose_x = var_573_transpose_x_0, transpose_y = var_573_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_573")];
-            tensor<fp32, [1]> var_574 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_574")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_573, y = var_576)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_597, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_599_transpose_x_0 = const()[name = tensor<string, []>("op_599_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_599_transpose_y_0 = const()[name = tensor<string, []>("op_599_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_599 = matmul(transpose_x = var_599_transpose_x_0, transpose_y = var_599_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_599")];
+            tensor<fp32, [1]> var_600 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_600")];
+            tensor<int32, [4]> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_602 = reshape(shape = var_601, x = var_600)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_599, y = var_602)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 1, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_579 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_579")];
-            tensor<bool, []> var_581_transpose_x_1 = const()[name = tensor<string, []>("op_581_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_581_transpose_y_1 = const()[name = tensor<string, []>("op_581_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_581 = matmul(transpose_x = var_581_transpose_x_1, transpose_y = var_581_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_581")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_579, y = var_581)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_583)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_585 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [1, 4, 64, 64]> var_586 = real_div(x = new_kv_unnorm_5, y = var_585)[name = tensor<string, []>("op_586")];
-            tensor<int32, [4]> var_587_perm_0 = const()[name = tensor<string, []>("op_587_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_605 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_605")];
+            tensor<bool, []> var_607_transpose_x_1 = const()[name = tensor<string, []>("op_607_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_607_transpose_y_1 = const()[name = tensor<string, []>("op_607_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_607 = matmul(transpose_x = var_607_transpose_x_1, transpose_y = var_607_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_607")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_605, y = var_607)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_609)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_611 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_611")];
+            tensor<fp32, [1, 4, 64, 64]> var_612 = real_div(x = new_kv_unnorm_5, y = var_611)[name = tensor<string, []>("op_612")];
+            tensor<int32, [4]> var_613_perm_0 = const()[name = tensor<string, []>("op_613_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_587 = transpose(perm = var_587_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_587)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_591, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 1, 256]> var_593 = silu(x = input_97)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 1, 256]> input_99 = mul(x = var_593, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 1, 4, 64]> var_613 = transpose(perm = var_613_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_43, x = var_613)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_617 = const()[name = tensor<string, []>("op_617"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_617, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 1, 256]> var_619 = silu(x = input_99)[name = tensor<string, []>("op_619")];
+            tensor<fp32, [1, 1, 256]> input_101 = mul(x = var_619, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_604_begin_0 = const()[name = tensor<string, []>("op_604_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_604_end_0 = const()[name = tensor<string, []>("op_604_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_604_end_mask_0 = const()[name = tensor<string, []>("op_604_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_604 = slice_by_index(begin = var_604_begin_0, end = var_604_end_0, end_mask = var_604_end_mask_0, x = window_9)[name = tensor<string, []>("op_604")];
+            tensor<int32, [3]> var_630_begin_0 = const()[name = tensor<string, []>("op_630_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_630_end_0 = const()[name = tensor<string, []>("op_630_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_630_end_mask_0 = const()[name = tensor<string, []>("op_630_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_630 = slice_by_index(begin = var_630_begin_0, end = var_630_end_0, end_mask = var_630_end_mask_0, x = window_9)[name = tensor<string, []>("op_630")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_604, x_15))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = window_11)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_52, interleave = window_11_interleave_0, values = (var_630, x_15))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_103 = concat(axis = var_38, interleave = input_103_interleave_0, values = window_11)[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_629_split_sizes_0 = const()[name = tensor<string, []>("op_629_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_629_axis_0 = const()[name = tensor<string, []>("op_629_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_629_0, tensor<fp32, [1, 256, 16]> var_629_1 = split(axis = var_629_axis_0, split_sizes = var_629_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 256, 16]> var_631 = sigmoid(x = var_629_1)[name = tensor<string, []>("op_631")];
-            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_629_0, y = var_631)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [1, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_655_split_sizes_0 = const()[name = tensor<string, []>("op_655_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_655_axis_0 = const()[name = tensor<string, []>("op_655_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_655_0, tensor<fp32, [1, 256, 16]> var_655_1 = split(axis = var_655_axis_0, split_sizes = var_655_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_655")];
+            tensor<fp32, [1, 256, 16]> var_657 = sigmoid(x = var_655_1)[name = tensor<string, []>("op_657")];
+            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_655_0, y = var_657)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [1, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_35, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [1, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_662_begin_0 = const()[name = tensor<string, []>("op_662_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_662_end_0 = const()[name = tensor<string, []>("op_662_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_662_end_mask_0 = const()[name = tensor<string, []>("op_662_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [1, 1, 256]> var_662 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_662")];
-            tensor<int32, [3]> var_664_perm_0 = const()[name = tensor<string, []>("op_664_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_664 = transpose(perm = var_664_perm_0, x = var_662)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 1, 256]> input_111 = add(x = x_15, y = var_664)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 1, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 1, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_688 = mul(x = input_119, y = var_687)[name = tensor<string, []>("op_688")];
-            tensor<fp32, [1, 1, 256]> input_121 = add(x = var_688, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_688_begin_0 = const()[name = tensor<string, []>("op_688_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_688_end_0 = const()[name = tensor<string, []>("op_688_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_688_end_mask_0 = const()[name = tensor<string, []>("op_688_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [1, 1, 256]> var_688 = slice_by_index(begin = var_688_begin_0, end = var_688_end_0, end_mask = var_688_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_688")];
+            tensor<int32, [3]> var_690_perm_0 = const()[name = tensor<string, []>("op_690_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_690 = transpose(perm = var_690_perm_0, x = var_688)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 1, 256]> input_113 = add(x = x_15, y = var_690)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 1, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 1, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_713 = const()[name = tensor<string, []>("op_713"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_714 = mul(x = input_121, y = var_713)[name = tensor<string, []>("op_714")];
+            tensor<fp32, [1, 1, 256]> input_123 = add(x = var_714, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 1, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 1, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_717 = const()[name = tensor<string, []>("op_717"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_718 = mul(x = input_131, y = var_717)[name = tensor<string, []>("op_718")];
-            tensor<fp32, [1, 1, 256]> input_133 = add(x = var_718, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_35, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 1, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 1, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_743 = const()[name = tensor<string, []>("op_743"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_744 = mul(x = input_133, y = var_743)[name = tensor<string, []>("op_744")];
+            tensor<fp32, [1, 1, 256]> input_135 = add(x = var_744, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_35, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -667,175 +673,168 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 1, 256]> var_732 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_733 = const()[name = tensor<string, []>("op_733"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_734 = reshape(shape = var_733, x = var_732)[name = tensor<string, []>("op_734")];
+            tensor<fp32, [1, 1, 256]> var_758 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_760 = reshape(shape = var_759, x = var_758)[name = tensor<string, []>("op_760")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_738 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_739 = const()[name = tensor<string, []>("op_739"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_740 = mul(x = var_738, y = var_739)[name = tensor<string, []>("op_740")];
-            tensor<int32, [4]> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_742 = reshape(shape = var_741, x = var_740)[name = tensor<string, []>("op_742")];
+            tensor<fp32, [1, 1, 256]> var_764 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_766 = mul(x = var_764, y = var_765)[name = tensor<string, []>("op_766")];
+            tensor<int32, [4]> var_767 = const()[name = tensor<string, []>("op_767"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_768 = reshape(shape = var_767, x = var_766)[name = tensor<string, []>("op_768")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_746 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_748 = reshape(shape = var_747, x = var_746)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 1, 256]> var_772 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_774 = reshape(shape = var_773, x = var_772)[name = tensor<string, []>("op_774")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 1, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [1]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_742)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_734)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_768)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_760)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 1, 1]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_758 = const()[name = tensor<string, []>("op_758"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_759 = reshape(shape = var_758, x = sqrt_s_t_7)[name = tensor<string, []>("op_759")];
-            tensor<fp32, [1, 1]> M_7 = real_div(x = encoder__causal_mask, y = var_759)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 1, 1]> var_761 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_761")];
+            tensor<int32, [2]> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_785 = reshape(shape = var_784, x = sqrt_s_t_7)[name = tensor<string, []>("op_785")];
+            tensor<fp32, [1, 1]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_785)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 1, 1]> var_787 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_787")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_748)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_761, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_763_transpose_x_0 = const()[name = tensor<string, []>("op_763_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_763_transpose_y_0 = const()[name = tensor<string, []>("op_763_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_763 = matmul(transpose_x = var_763_transpose_x_0, transpose_y = var_763_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_763")];
-            tensor<fp32, [1]> var_764 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_763, y = var_766)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_774)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_787, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_789_transpose_x_0 = const()[name = tensor<string, []>("op_789_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_789_transpose_y_0 = const()[name = tensor<string, []>("op_789_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_789 = matmul(transpose_x = var_789_transpose_x_0, transpose_y = var_789_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_789")];
+            tensor<fp32, [1]> var_790 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_790")];
+            tensor<int32, [4]> var_791 = const()[name = tensor<string, []>("op_791"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_792 = reshape(shape = var_791, x = var_790)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_789, y = var_792)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 1, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_769 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_769")];
-            tensor<bool, []> var_771_transpose_x_1 = const()[name = tensor<string, []>("op_771_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_771_transpose_y_1 = const()[name = tensor<string, []>("op_771_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_771 = matmul(transpose_x = var_771_transpose_x_1, transpose_y = var_771_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_771")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_769, y = var_771)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_773)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_775 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_775")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_775)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_777_perm_0 = const()[name = tensor<string, []>("op_777_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_795 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_795")];
+            tensor<bool, []> var_797_transpose_x_1 = const()[name = tensor<string, []>("op_797_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_797_transpose_y_1 = const()[name = tensor<string, []>("op_797_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_797 = matmul(transpose_x = var_797_transpose_x_1, transpose_y = var_797_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_797")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_795, y = var_797)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_799)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_801 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_801")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_801)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_803_perm_0 = const()[name = tensor<string, []>("op_803_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_777 = transpose(perm = var_777_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_777)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_781, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 1, 256]> var_783 = silu(x = input_137)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [1, 1, 256]> input_139 = mul(x = var_783, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 1, 4, 64]> var_803 = transpose(perm = var_803_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_43, x = var_803)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_807, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 1, 256]> var_809 = silu(x = input_139)[name = tensor<string, []>("op_809")];
+            tensor<fp32, [1, 1, 256]> input_141 = mul(x = var_809, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_794_begin_0 = const()[name = tensor<string, []>("op_794_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_794_end_0 = const()[name = tensor<string, []>("op_794_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_794_end_mask_0 = const()[name = tensor<string, []>("op_794_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_794 = slice_by_index(begin = var_794_begin_0, end = var_794_end_0, end_mask = var_794_end_mask_0, x = window_13)[name = tensor<string, []>("op_794")];
+            tensor<int32, [3]> var_820_begin_0 = const()[name = tensor<string, []>("op_820_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_820_end_0 = const()[name = tensor<string, []>("op_820_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_820_end_mask_0 = const()[name = tensor<string, []>("op_820_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_820 = slice_by_index(begin = var_820_begin_0, end = var_820_end_0, end_mask = var_820_end_mask_0, x = window_13)[name = tensor<string, []>("op_820")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_794, x_21))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = window)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_52, interleave = window_interleave_0, values = (var_820, x_21))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_143 = concat(axis = var_38, interleave = input_143_interleave_0, values = window)[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_819_split_sizes_0 = const()[name = tensor<string, []>("op_819_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_819_axis_0 = const()[name = tensor<string, []>("op_819_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_819_0, tensor<fp32, [1, 256, 16]> var_819_1 = split(axis = var_819_axis_0, split_sizes = var_819_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 256, 16]> var_821 = sigmoid(x = var_819_1)[name = tensor<string, []>("op_821")];
-            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_819_0, y = var_821)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [1, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_845_split_sizes_0 = const()[name = tensor<string, []>("op_845_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_845_axis_0 = const()[name = tensor<string, []>("op_845_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_845_0, tensor<fp32, [1, 256, 16]> var_845_1 = split(axis = var_845_axis_0, split_sizes = var_845_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 256, 16]> var_847 = sigmoid(x = var_845_1)[name = tensor<string, []>("op_847")];
+            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_845_0, y = var_847)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [1, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_35, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [1, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_852_begin_0 = const()[name = tensor<string, []>("op_852_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_852_end_0 = const()[name = tensor<string, []>("op_852_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_852_end_mask_0 = const()[name = tensor<string, []>("op_852_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [1, 1, 256]> var_852 = slice_by_index(begin = var_852_begin_0, end = var_852_end_0, end_mask = var_852_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_852")];
-            tensor<int32, [3]> var_854_perm_0 = const()[name = tensor<string, []>("op_854_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_854 = transpose(perm = var_854_perm_0, x = var_852)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 1, 256]> input_151 = add(x = x_21, y = var_854)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 1, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 1, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_877 = const()[name = tensor<string, []>("op_877"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_878 = mul(x = input_159, y = var_877)[name = tensor<string, []>("op_878")];
-            tensor<fp32, [1, 1, 256]> input_161 = add(x = var_878, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_878_begin_0 = const()[name = tensor<string, []>("op_878_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_878_end_0 = const()[name = tensor<string, []>("op_878_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_878_end_mask_0 = const()[name = tensor<string, []>("op_878_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [1, 1, 256]> var_878 = slice_by_index(begin = var_878_begin_0, end = var_878_end_0, end_mask = var_878_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_878")];
+            tensor<int32, [3]> var_880_perm_0 = const()[name = tensor<string, []>("op_880_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_880 = transpose(perm = var_880_perm_0, x = var_878)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 1, 256]> input_153 = add(x = x_21, y = var_880)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_35, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 1, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 1, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_904 = mul(x = input_161, y = var_903)[name = tensor<string, []>("op_904")];
+            tensor<fp32, [1, 1, 256]> input_163 = add(x = var_904, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_35, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 1]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_40, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = cat)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_898 = const()[name = tensor<string, []>("op_898"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 1, 1]> var_899 = reduce_l2_norm(axes = var_898, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = cat)[name = tensor<string, []>("op_922")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 1, 1]> var_925 = reduce_l2_norm(axes = var_924, keep_dims = var_34, x = input_165)[name = tensor<string, []>("op_925")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_899)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_903_axis_0 = const()[name = tensor<string, []>("op_903_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_903_axis_0, values = (var_206, var_396, var_586, nkv_1))[name = tensor<string, []>("op_903")];
-            tensor<int32, []> var_905_axis_0 = const()[name = tensor<string, []>("op_905_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_905_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_905")];
-            tensor<int32, []> var_907_axis_0 = const()[name = tensor<string, []>("op_907_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_907_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_907")];
-            tensor<fp32, []> var_916 = const()[name = tensor<string, []>("op_916"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_923 = const()[name = tensor<string, []>("op_923"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_926 = const()[name = tensor<string, []>("op_926"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<int32, []>(0)];
-            tensor<fp32, [1, 1, 6, 256]> var_993 = const()[name = tensor<string, []>("op_993"), val = tensor<fp32, [1, 1, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_998_axes_0 = const()[name = tensor<string, []>("op_998_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 1, 1, 256]> var_998 = expand_dims(axes = var_998_axes_0, x = emb)[name = tensor<string, []>("op_998")];
+            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_48, beta = const_12, x = var_925)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_929_axis_0, values = (var_232, var_422, var_612, nkv_1))[name = tensor<string, []>("op_929")];
+            tensor<int32, []> var_931_axis_0 = const()[name = tensor<string, []>("op_931_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_931_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_931")];
+            tensor<int32, []> var_933_axis_0 = const()[name = tensor<string, []>("op_933_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_933_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_933")];
+            tensor<fp32, [1, 1, 6, 256]> var_996 = const()[name = tensor<string, []>("op_996"), val = tensor<fp32, [1, 1, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
+            tensor<int32, [1]> var_1001_axes_0 = const()[name = tensor<string, []>("op_1001_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 1, 1, 256]> var_1001 = expand_dims(axes = var_1001_axes_0, x = emb)[name = tensor<string, []>("op_1001")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 6, 1])];
-            tensor<fp32, [1, 1, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_998)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 1, 6, 512]> input_165 = concat(axis = var_930, interleave = input_165_interleave_0, values = (emb_exp, var_993))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 1, 6, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1010 = const()[name = tensor<string, []>("op_1010"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [1, 6, 1, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [6, 1, 256]> x_29 = reshape(shape = var_1010, x = var_1006)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 1, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1001)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 1, 6, 512]> input_167 = concat(axis = var_41, interleave = input_167_interleave_0, values = (emb_exp, var_996))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 1, 6, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1009_perm_0 = const()[name = tensor<string, []>("op_1009_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1013 = const()[name = tensor<string, []>("op_1013"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [1, 6, 1, 256]> var_1009 = transpose(perm = var_1009_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [6, 1, 256]> x_29 = reshape(shape = var_1013, x = var_1009)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -846,131 +845,131 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [6, 1, 256]> var_1018 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1020 = reshape(shape = var_1019, x = var_1018)[name = tensor<string, []>("op_1020")];
+            tensor<fp32, [6, 1, 256]> var_1021 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1023 = reshape(shape = var_1022, x = var_1021)[name = tensor<string, []>("op_1023")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> var_1024 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1025 = const()[name = tensor<string, []>("op_1025"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 1, 256]> var_1026 = mul(x = var_1024, y = var_1025)[name = tensor<string, []>("op_1026")];
-            tensor<int32, [4]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1028 = reshape(shape = var_1027, x = var_1026)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [6, 1, 256]> var_1027 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1028 = const()[name = tensor<string, []>("op_1028"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 1, 256]> var_1029 = mul(x = var_1027, y = var_1028)[name = tensor<string, []>("op_1029")];
+            tensor<int32, [4]> var_1030 = const()[name = tensor<string, []>("op_1030"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1031 = reshape(shape = var_1030, x = var_1029)[name = tensor<string, []>("op_1031")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> var_1032 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1033 = const()[name = tensor<string, []>("op_1033"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1034 = reshape(shape = var_1033, x = var_1032)[name = tensor<string, []>("op_1034")];
+            tensor<fp32, [6, 1, 256]> var_1035 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1036 = const()[name = tensor<string, []>("op_1036"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1037 = reshape(shape = var_1036, x = var_1035)[name = tensor<string, []>("op_1037")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [6, 1, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_936, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_38, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [1]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_1 = clip(alpha = var_926, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [1]> clip_1 = clip(alpha = var_28, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [1]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1028)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [6, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1020)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [6, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1031)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [6, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1023)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [6, 4, 1, 1]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1046 = const()[name = tensor<string, []>("op_1046"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1047 = reshape(shape = var_1046, x = valid_mask)[name = tensor<string, []>("op_1047")];
             tensor<int32, [2]> var_1049 = const()[name = tensor<string, []>("op_1049"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = sqrt_s_t_9)[name = tensor<string, []>("op_1050")];
-            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1047, y = var_1050)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [6, 4, 1, 1]> var_1052 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = valid_mask)[name = tensor<string, []>("op_1050")];
+            tensor<int32, [2]> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1053 = reshape(shape = var_1052, x = sqrt_s_t_9)[name = tensor<string, []>("op_1053")];
+            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1050, y = var_1053)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [6, 4, 1, 1]> var_1055 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1055")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1034)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [6, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1052, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1054_transpose_x_0 = const()[name = tensor<string, []>("op_1054_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1054_transpose_y_0 = const()[name = tensor<string, []>("op_1054_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 1, 64]> var_1054 = matmul(transpose_x = var_1054_transpose_x_0, transpose_y = var_1054_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1054")];
-            tensor<fp32, [1]> var_1055 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1055")];
-            tensor<int32, [4]> var_1056 = const()[name = tensor<string, []>("op_1056"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1057 = reshape(shape = var_1056, x = var_1055)[name = tensor<string, []>("op_1057")];
-            tensor<fp32, [6, 4, 1, 64]> cross_9 = mul(x = var_1054, y = var_1057)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [6, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1037)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [6, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1055, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1057_transpose_x_0 = const()[name = tensor<string, []>("op_1057_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1057_transpose_y_0 = const()[name = tensor<string, []>("op_1057_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 1, 64]> var_1057 = matmul(transpose_x = var_1057_transpose_x_0, transpose_y = var_1057_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1057")];
+            tensor<fp32, [1]> var_1058 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1058")];
+            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [6, 4, 1, 64]> cross_9 = mul(x = var_1057, y = var_1060)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [6, 4, 1, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1060 = const()[name = tensor<string, []>("op_1060"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1061 = reshape(shape = var_1060, x = valid_mask)[name = tensor<string, []>("op_1061")];
-            tensor<fp32, [6, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1061)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1063 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1063")];
-            tensor<bool, []> var_1065_transpose_x_1 = const()[name = tensor<string, []>("op_1065_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1065_transpose_y_1 = const()[name = tensor<string, []>("op_1065_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1065 = matmul(transpose_x = var_1065_transpose_x_1, transpose_y = var_1065_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1065")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1063, y = var_1065)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1067_keep_dims_0 = const()[name = tensor<string, []>("op_1067_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1067 = reduce_sum(keep_dims = var_1067_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1067")];
-            tensor<int32, [1]> var_1068 = const()[name = tensor<string, []>("op_1068"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1069 = reshape(shape = var_1068, x = var_1067)[name = tensor<string, []>("op_1069")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1069)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1063 = const()[name = tensor<string, []>("op_1063"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1064 = reshape(shape = var_1063, x = valid_mask)[name = tensor<string, []>("op_1064")];
+            tensor<fp32, [6, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1064)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [6, 4, 64, 64]> var_1066 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1066")];
+            tensor<bool, []> var_1068_transpose_x_1 = const()[name = tensor<string, []>("op_1068_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1068_transpose_y_1 = const()[name = tensor<string, []>("op_1068_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1068 = matmul(transpose_x = var_1068_transpose_x_1, transpose_y = var_1068_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1068")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1066, y = var_1068)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1070_keep_dims_0 = const()[name = tensor<string, []>("op_1070_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1070 = reduce_sum(keep_dims = var_1070_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1070")];
+            tensor<int32, [1]> var_1071 = const()[name = tensor<string, []>("op_1071"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1072 = reshape(shape = var_1071, x = var_1070)[name = tensor<string, []>("op_1072")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1072)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_926, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_28, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1073 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1073")];
-            tensor<int32, [4]> var_1074_perm_0 = const()[name = tensor<string, []>("op_1074_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [6, 4, 64, 64]> var_1076 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1076")];
+            tensor<int32, [4]> var_1077_perm_0 = const()[name = tensor<string, []>("op_1077_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 1, 4, 64]> var_1074 = transpose(perm = var_1074_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [6, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_923, x = var_1074)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [6, 1, 256]> out_29 = reshape(shape = var_1078, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [6, 1, 256]> var_1080 = silu(x = input_169)[name = tensor<string, []>("op_1080")];
-            tensor<fp32, [6, 1, 256]> input_171 = mul(x = var_1080, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [6, 1, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [6, 1, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 1, 4, 64]> var_1077 = transpose(perm = var_1077_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [6, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_43, x = var_1077)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [6, 1, 256]> out_29 = reshape(shape = var_1081, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [6, 1, 256]> var_1083 = silu(x = input_171)[name = tensor<string, []>("op_1083")];
+            tensor<fp32, [6, 1, 256]> input_173 = mul(x = var_1083, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 1, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [6, 1, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_921, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1090 = const()[name = tensor<string, []>("op_1090"), val = tensor<int32, [4]>([1, 6, 1, 256])];
-            tensor<fp32, [1, 6, 1, 256]> var_1091 = reshape(shape = var_1090, x = xt_1)[name = tensor<string, []>("op_1091")];
-            tensor<int32, [4]> var_1092_perm_0 = const()[name = tensor<string, []>("op_1092_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1095 = const()[name = tensor<string, []>("op_1095"), val = tensor<int32, [3]>([1, 6, 256])];
-            tensor<fp32, [1, 1, 6, 256]> var_1092 = transpose(perm = var_1092_perm_0, x = var_1091)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [1, 6, 256]> query_1 = reshape(shape = var_1095, x = var_1092)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [6, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_35, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [4]>([1, 6, 1, 256])];
+            tensor<fp32, [1, 6, 1, 256]> var_1094 = reshape(shape = var_1093, x = xt_1)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [4]> var_1095_perm_0 = const()[name = tensor<string, []>("op_1095_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1098 = const()[name = tensor<string, []>("op_1098"), val = tensor<int32, [3]>([1, 6, 256])];
+            tensor<fp32, [1, 1, 6, 256]> var_1095 = transpose(perm = var_1095_perm_0, x = var_1094)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [1, 6, 256]> query_1 = reshape(shape = var_1098, x = var_1095)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 1, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [6, 1, 768]> var_1118 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [6, 1, 768]> var_1121 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([6, 1, 3, 256])];
-            tensor<fp32, [6, 1, 3, 256]> var_1120 = reshape(shape = concat_1, x = var_1118)[name = tensor<string, []>("op_1120")];
-            tensor<int32, [1]> var_1121_axes_0 = const()[name = tensor<string, []>("op_1121_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 1, 3, 256]> var_1121 = expand_dims(axes = var_1121_axes_0, x = var_1120)[name = tensor<string, []>("op_1121")];
-            tensor<int32, [5]> var_1122_perm_0 = const()[name = tensor<string, []>("op_1122_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1123_axes_0 = const()[name = tensor<string, []>("op_1123_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 1, 1, 256]> var_1122 = transpose(perm = var_1122_perm_0, x = var_1121)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 6, 1, 256]> var_1123 = squeeze(axes = var_1123_axes_0, x = var_1122)[name = tensor<string, []>("op_1123")];
+            tensor<fp32, [6, 1, 3, 256]> var_1123 = reshape(shape = concat_1, x = var_1121)[name = tensor<string, []>("op_1123")];
+            tensor<int32, [1]> var_1124_axes_0 = const()[name = tensor<string, []>("op_1124_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 1, 3, 256]> var_1124 = expand_dims(axes = var_1124_axes_0, x = var_1123)[name = tensor<string, []>("op_1124")];
+            tensor<int32, [5]> var_1125_perm_0 = const()[name = tensor<string, []>("op_1125_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1126_axes_0 = const()[name = tensor<string, []>("op_1126_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 1, 1, 256]> var_1125 = transpose(perm = var_1125_perm_0, x = var_1124)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 6, 1, 256]> var_1126 = squeeze(axes = var_1126_axes_0, x = var_1125)[name = tensor<string, []>("op_1126")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 6, 1, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [6, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 6, 1, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [6, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 6, 1, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1131 = const()[name = tensor<string, []>("op_1131"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1132 = reshape(shape = var_1131, x = q_11)[name = tensor<string, []>("op_1132")];
+            tensor<fp32, [6, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1134 = const()[name = tensor<string, []>("op_1134"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1135 = reshape(shape = var_1134, x = q_11)[name = tensor<string, []>("op_1135")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1138 = const()[name = tensor<string, []>("op_1138"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1139 = reshape(shape = var_1138, x = k_11)[name = tensor<string, []>("op_1139")];
+            tensor<int32, [3]> var_1141 = const()[name = tensor<string, []>("op_1141"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1142 = reshape(shape = var_1141, x = k_11)[name = tensor<string, []>("op_1142")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1146 = reshape(shape = var_1145, x = v_11)[name = tensor<string, []>("op_1146")];
+            tensor<int32, [3]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1149 = reshape(shape = var_1148, x = v_11)[name = tensor<string, []>("op_1149")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1149 = const()[name = tensor<string, []>("op_1149"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1132)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [1, 4, 6, 64]> q_15 = reshape(shape = var_1149, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1151 = const()[name = tensor<string, []>("op_1151"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1139)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [1, 4, 6, 64]> k_15 = reshape(shape = var_1151, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1153 = const()[name = tensor<string, []>("op_1153"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1146)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [1, 4, 6, 64]> v_15 = reshape(shape = var_1153, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1135)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [1, 4, 6, 64]> q_15 = reshape(shape = var_1152, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1142)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [1, 4, 6, 64]> k_15 = reshape(shape = var_1154, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1149)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [1, 4, 6, 64]> v_15 = reshape(shape = var_1156, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 6, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -981,30 +980,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 6, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1161 = const()[name = tensor<string, []>("op_1161"), val = tensor<int32, [2]>([6, 256])];
-            tensor<fp32, [6, 1, 4, 64]> var_1157 = transpose(perm = var_1156, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [6, 256]> attn_output_3 = reshape(shape = var_1161, x = var_1157)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [6, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [6, 1, 256]> attn_output_7 = reshape(shape = var_1165, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [2]>([6, 256])];
+            tensor<fp32, [6, 1, 4, 64]> var_1160 = transpose(perm = var_1159, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [6, 256]> attn_output_3 = reshape(shape = var_1164, x = var_1160)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [6, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1168 = const()[name = tensor<string, []>("op_1168"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [6, 1, 256]> attn_output_7 = reshape(shape = var_1168, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 6, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [1, 6, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 6, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_921, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [1, 6, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [1, 6, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [1, 6, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [1, 6, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 6, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 6, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_35, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [1, 6, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [1, 6, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 6, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [1, 6, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_921, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 6, 256])];
-            tensor<fp32, [1, 1, 6, 256]> x_31 = reshape(shape = var_1185, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1187_perm_0 = const()[name = tensor<string, []>("op_1187_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [1, 6, 1, 256]> var_1187 = transpose(perm = var_1187_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [6, 1, 256]> x = reshape(shape = var_1191, x = var_1187)[name = tensor<string, []>("x")];
+            tensor<fp32, [1, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_35, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([1, 1, 6, 256])];
+            tensor<fp32, [1, 1, 6, 256]> x_31 = reshape(shape = var_1188, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1190_perm_0 = const()[name = tensor<string, []>("op_1190_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [1, 6, 1, 256]> var_1190 = transpose(perm = var_1190_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [6, 1, 256]> x = reshape(shape = var_1194, x = var_1190)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1015,120 +1014,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [6, 1, 256]> var_1199 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1200 = const()[name = tensor<string, []>("op_1200"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1201 = reshape(shape = var_1200, x = var_1199)[name = tensor<string, []>("op_1201")];
+            tensor<fp32, [6, 1, 256]> var_1202 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1203 = const()[name = tensor<string, []>("op_1203"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1204 = reshape(shape = var_1203, x = var_1202)[name = tensor<string, []>("op_1204")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> var_1205 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 1, 256]> var_1207 = mul(x = var_1205, y = var_1206)[name = tensor<string, []>("op_1207")];
-            tensor<int32, [4]> var_1208 = const()[name = tensor<string, []>("op_1208"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1209 = reshape(shape = var_1208, x = var_1207)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [6, 1, 256]> var_1208 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 1, 256]> var_1210 = mul(x = var_1208, y = var_1209)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [4]> var_1211 = const()[name = tensor<string, []>("op_1211"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1212 = reshape(shape = var_1211, x = var_1210)[name = tensor<string, []>("op_1212")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> var_1213 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1214 = const()[name = tensor<string, []>("op_1214"), val = tensor<int32, [4]>([6, 1, 4, 64])];
-            tensor<fp32, [6, 1, 4, 64]> var_1215 = reshape(shape = var_1214, x = var_1213)[name = tensor<string, []>("op_1215")];
+            tensor<fp32, [6, 1, 256]> var_1216 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([6, 1, 4, 64])];
+            tensor<fp32, [6, 1, 4, 64]> var_1218 = reshape(shape = var_1217, x = var_1216)[name = tensor<string, []>("op_1218")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 1, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [6, 1, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [1]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_3 = clip(alpha = var_926, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [1]> clip_3 = clip(alpha = var_28, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [1]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1209)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [6, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1201)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [6, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1212)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [6, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1204)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [6, 4, 1, 1]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1231 = reshape(shape = var_1230, x = sqrt_s_t)[name = tensor<string, []>("op_1231")];
-            tensor<fp32, [1, 1]> M = real_div(x = var_1047, y = var_1231)[name = tensor<string, []>("M")];
-            tensor<fp32, [6, 4, 1, 1]> var_1233 = mul(x = qk, y = M)[name = tensor<string, []>("op_1233")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1215)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [6, 4, 1, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1233, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1235_transpose_x_0 = const()[name = tensor<string, []>("op_1235_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1235_transpose_y_0 = const()[name = tensor<string, []>("op_1235_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 1, 64]> var_1235 = matmul(transpose_x = var_1235_transpose_x_0, transpose_y = var_1235_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1235")];
-            tensor<fp32, [1]> var_1236 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1236")];
-            tensor<int32, [4]> var_1237 = const()[name = tensor<string, []>("op_1237"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1238 = reshape(shape = var_1237, x = var_1236)[name = tensor<string, []>("op_1238")];
-            tensor<fp32, [6, 4, 1, 64]> cross = mul(x = var_1235, y = var_1238)[name = tensor<string, []>("cross")];
-            tensor<fp32, [6, 4, 1, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [6, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1061)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [6, 4, 64, 64]> var_1244 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1244")];
-            tensor<bool, []> var_1246_transpose_x_1 = const()[name = tensor<string, []>("op_1246_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1246_transpose_y_1 = const()[name = tensor<string, []>("op_1246_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1246 = matmul(transpose_x = var_1246_transpose_x_1, transpose_y = var_1246_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1246")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1244, y = var_1246)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1069)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1233 = const()[name = tensor<string, []>("op_1233"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1234 = reshape(shape = var_1233, x = sqrt_s_t)[name = tensor<string, []>("op_1234")];
+            tensor<fp32, [1, 1]> M = real_div(x = var_1050, y = var_1234)[name = tensor<string, []>("M")];
+            tensor<fp32, [6, 4, 1, 1]> var_1236 = mul(x = qk, y = M)[name = tensor<string, []>("op_1236")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1218)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [6, 4, 1, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1236, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1238_transpose_x_0 = const()[name = tensor<string, []>("op_1238_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1238_transpose_y_0 = const()[name = tensor<string, []>("op_1238_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 1, 64]> var_1238 = matmul(transpose_x = var_1238_transpose_x_0, transpose_y = var_1238_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1238")];
+            tensor<fp32, [1]> var_1239 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [6, 4, 1, 64]> cross = mul(x = var_1238, y = var_1241)[name = tensor<string, []>("cross")];
+            tensor<fp32, [6, 4, 1, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [6, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1064)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [6, 4, 64, 64]> var_1247 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1247")];
+            tensor<bool, []> var_1249_transpose_x_1 = const()[name = tensor<string, []>("op_1249_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1249_transpose_y_1 = const()[name = tensor<string, []>("op_1249_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1249 = matmul(transpose_x = var_1249_transpose_x_1, transpose_y = var_1249_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1249")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1247, y = var_1249)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1072)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_926, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_28, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [6, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1255_perm_0 = const()[name = tensor<string, []>("op_1255_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1258_perm_0 = const()[name = tensor<string, []>("op_1258_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 1, 4, 64]> var_1255 = transpose(perm = var_1255_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [6, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_923, x = var_1255)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [6, 1, 256]> out = reshape(shape = var_1259, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [6, 1, 256]> var_1261 = silu(x = input_187)[name = tensor<string, []>("op_1261")];
-            tensor<fp32, [6, 1, 256]> input_189 = mul(x = var_1261, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [6, 1, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [6, 1, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 1, 4, 64]> var_1258 = transpose(perm = var_1258_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [6, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_43, x = var_1258)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [6, 1, 256]> out = reshape(shape = var_1262, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [6, 1, 256]> var_1264 = silu(x = input_189)[name = tensor<string, []>("op_1264")];
+            tensor<fp32, [6, 1, 256]> input_191 = mul(x = var_1264, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 1, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [6, 1, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_921, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1271 = const()[name = tensor<string, []>("op_1271"), val = tensor<int32, [4]>([1, 6, 1, 256])];
-            tensor<fp32, [1, 6, 1, 256]> var_1272 = reshape(shape = var_1271, x = xt_5)[name = tensor<string, []>("op_1272")];
-            tensor<int32, [4]> var_1273_perm_0 = const()[name = tensor<string, []>("op_1273_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1276 = const()[name = tensor<string, []>("op_1276"), val = tensor<int32, [3]>([1, 6, 256])];
-            tensor<fp32, [1, 1, 6, 256]> var_1273 = transpose(perm = var_1273_perm_0, x = var_1272)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [1, 6, 256]> query_5 = reshape(shape = var_1276, x = var_1273)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [6, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_35, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [4]>([1, 6, 1, 256])];
+            tensor<fp32, [1, 6, 1, 256]> var_1275 = reshape(shape = var_1274, x = xt_5)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [4]> var_1276_perm_0 = const()[name = tensor<string, []>("op_1276_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1279 = const()[name = tensor<string, []>("op_1279"), val = tensor<int32, [3]>([1, 6, 256])];
+            tensor<fp32, [1, 1, 6, 256]> var_1276 = transpose(perm = var_1276_perm_0, x = var_1275)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [1, 6, 256]> query_5 = reshape(shape = var_1279, x = var_1276)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 1, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [6, 1, 768]> var_1299 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [6, 1, 768]> var_1302 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([6, 1, 3, 256])];
-            tensor<fp32, [6, 1, 3, 256]> var_1301 = reshape(shape = concat_2, x = var_1299)[name = tensor<string, []>("op_1301")];
-            tensor<int32, [1]> var_1302_axes_0 = const()[name = tensor<string, []>("op_1302_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 1, 3, 256]> var_1302 = expand_dims(axes = var_1302_axes_0, x = var_1301)[name = tensor<string, []>("op_1302")];
-            tensor<int32, [5]> var_1303_perm_0 = const()[name = tensor<string, []>("op_1303_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1304_axes_0 = const()[name = tensor<string, []>("op_1304_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 1, 1, 256]> var_1303 = transpose(perm = var_1303_perm_0, x = var_1302)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 6, 1, 256]> var_1304 = squeeze(axes = var_1304_axes_0, x = var_1303)[name = tensor<string, []>("op_1304")];
+            tensor<fp32, [6, 1, 3, 256]> var_1304 = reshape(shape = concat_2, x = var_1302)[name = tensor<string, []>("op_1304")];
+            tensor<int32, [1]> var_1305_axes_0 = const()[name = tensor<string, []>("op_1305_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 1, 3, 256]> var_1305 = expand_dims(axes = var_1305_axes_0, x = var_1304)[name = tensor<string, []>("op_1305")];
+            tensor<int32, [5]> var_1306_perm_0 = const()[name = tensor<string, []>("op_1306_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1307_axes_0 = const()[name = tensor<string, []>("op_1307_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 1, 1, 256]> var_1306 = transpose(perm = var_1306_perm_0, x = var_1305)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 6, 1, 256]> var_1307 = squeeze(axes = var_1307_axes_0, x = var_1306)[name = tensor<string, []>("op_1307")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 6, 1, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [6, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 6, 1, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [6, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 6, 1, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1312 = const()[name = tensor<string, []>("op_1312"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1313 = reshape(shape = var_1312, x = q_19)[name = tensor<string, []>("op_1313")];
+            tensor<fp32, [6, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1315 = const()[name = tensor<string, []>("op_1315"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1316 = reshape(shape = var_1315, x = q_19)[name = tensor<string, []>("op_1316")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1319 = const()[name = tensor<string, []>("op_1319"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1320 = reshape(shape = var_1319, x = k_19)[name = tensor<string, []>("op_1320")];
+            tensor<int32, [3]> var_1322 = const()[name = tensor<string, []>("op_1322"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1323 = reshape(shape = var_1322, x = k_19)[name = tensor<string, []>("op_1323")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [3]>([6, 4, 64])];
-            tensor<fp32, [6, 4, 64]> var_1327 = reshape(shape = var_1326, x = v_19)[name = tensor<string, []>("op_1327")];
+            tensor<int32, [3]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [3]>([6, 4, 64])];
+            tensor<fp32, [6, 4, 64]> var_1330 = reshape(shape = var_1329, x = v_19)[name = tensor<string, []>("op_1330")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1330 = const()[name = tensor<string, []>("op_1330"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1313)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [1, 4, 6, 64]> q = reshape(shape = var_1330, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1332 = const()[name = tensor<string, []>("op_1332"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1320)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [1, 4, 6, 64]> k = reshape(shape = var_1332, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1334 = const()[name = tensor<string, []>("op_1334"), val = tensor<int32, [4]>([1, 4, 6, 64])];
-            tensor<fp32, [4, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1327)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [1, 4, 6, 64]> v = reshape(shape = var_1334, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1316)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [1, 4, 6, 64]> q = reshape(shape = var_1333, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1323)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [1, 4, 6, 64]> k = reshape(shape = var_1335, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([1, 4, 6, 64])];
+            tensor<fp32, [4, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1330)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [1, 4, 6, 64]> v = reshape(shape = var_1337, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 6, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1139,34 +1138,34 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 6, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1342 = const()[name = tensor<string, []>("op_1342"), val = tensor<int32, [2]>([6, 256])];
-            tensor<fp32, [6, 1, 4, 64]> var_1338 = transpose(perm = var_1337, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [6, 256]> attn_output_11 = reshape(shape = var_1342, x = var_1338)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [6, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([6, 1, 256])];
-            tensor<fp32, [6, 1, 256]> attn_output = reshape(shape = var_1346, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1345 = const()[name = tensor<string, []>("op_1345"), val = tensor<int32, [2]>([6, 256])];
+            tensor<fp32, [6, 1, 4, 64]> var_1341 = transpose(perm = var_1340, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [6, 256]> attn_output_11 = reshape(shape = var_1345, x = var_1341)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [6, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1349 = const()[name = tensor<string, []>("op_1349"), val = tensor<int32, [3]>([6, 1, 256])];
+            tensor<fp32, [6, 1, 256]> attn_output = reshape(shape = var_1349, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 6, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [1, 6, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 6, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_921, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [1, 6, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [1, 6, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [1, 6, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [1, 6, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 6, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 6, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_35, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [1, 6, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [1, 6, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 6, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [1, 6, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_921, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 6, 256])];
-            tensor<fp32, [1, 1, 6, 256]> input = reshape(shape = var_1366, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1368 = const()[name = tensor<string, []>("op_1368"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 6, 1]> var_1369 = reduce_l2_norm(axes = var_1368, keep_dims = var_924, x = input)[name = tensor<string, []>("op_1369")];
+            tensor<fp32, [1, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_35, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([1, 1, 6, 256])];
+            tensor<fp32, [1, 1, 6, 256]> input = reshape(shape = var_1369, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 6, 1]> var_1372 = reduce_l2_norm(axes = var_1371, keep_dims = var_34, x = input)[name = tensor<string, []>("op_1372")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 6, 1]> clip_5 = clip(alpha = var_916, beta = const_42, x = var_1369)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 1, 6, 256]> var_1371 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1371")];
+            tensor<fp32, [1, 1, 6, 1]> clip_5 = clip(alpha = var_48, beta = const_42, x = var_1372)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 1, 6, 256]> var_1374 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1374")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([1, 256, 6])];
-            tensor<fp32, [1, 1, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1371)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 1, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1374)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [1, 256, 6]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1175,10 +1174,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 1, 5])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 1, 4]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = matmul_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 1, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1375")];
-            tensor<int32, []> var_1377_axis_0 = const()[name = tensor<string, []>("op_1377_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1377_axis_0, values = (var_1073, nkv))[name = tensor<string, []>("op_1377")];
-            tensor<int32, []> var_1379_axis_0 = const()[name = tensor<string, []>("op_1379_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1379_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1379")];
+            tensor<fp32, [1, 1, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1378")];
+            tensor<int32, []> var_1380_axis_0 = const()[name = tensor<string, []>("op_1380_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1380_axis_0, values = (var_1076, nkv))[name = tensor<string, []>("op_1380")];
+            tensor<int32, []> var_1382_axis_0 = const()[name = tensor<string, []>("op_1382_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1382_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1382")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 23b9a3151223c06a795711a72944fec0a2d73987..dc361a040c87fc3818df1830df2c72f647d6c270 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d99472f0bd00e091613210906f7fb7be42e92f6019968246af665c07dab02a7
-size 171348
+oid sha256:fc177110db14e37e532be41bb47accb450f1ce89bb7876b675b2d4a25d797e1e
+size 175266
diff --git a/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Manifest.json b/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Manifest.json
index 6a6df0d6218423e5cf5828262039e34f982ae069..600ad3c5b36c6613b59be15141c9242bb273eb53 100644
--- a/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Manifest.json
+++ b/optimized/ami/100ms/ls_eend_ami_100ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "25D17A51-8136-403A-8E7C-65E586D88F62": {
+        "3ADB4906-F6AE-45F6-8FBA-E66FCC7A8291": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "966E268D-F501-43F0-BD33-968439381A4D": {
+        "403AED6F-5C8E-400B-80AA-18BE809ED8A3": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "966E268D-F501-43F0-BD33-968439381A4D"
+    "rootModelIdentifier": "403AED6F-5C8E-400B-80AA-18BE809ED8A3"
 }
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/analytics/coremldata.bin b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/analytics/coremldata.bin
index dbf2843795d71fc2c8081b28675d1065af2fc06c..38434f198256b9bb1ff20657e279c5ad2ef9ed87 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6a81fd874a82f12a3d9273121dfc5cf2c7b7fc3ab75940416ce700d6c464715
+oid sha256:879a50af51ba6bf2014083aac4b8757dd61926e4ea7e84b03b88c8bd34bf19a7
 size 243
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/coremldata.bin b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/coremldata.bin
index c2e0a156964f1117ec1437b8f2af288e3cae4b6a..33a973adb4aaa20c921d030786fdd9741671d4e6 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/coremldata.bin
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ff7a04daff876566a5951933f1afdcf6a2bc73cf8214ddcb95845fb83a7ad66
-size 1292
+oid sha256:db2f6d6a11db778787a47cac87d5933e6c831d13f92e280b3ba1d11f75f4ba6c
+size 1395
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/metadata.json b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/metadata.json
index fceb4211edaa544fa740da0dbf819fbec3b8ddf1..b44cfc54249ccb12efacb97f32bc8fa49ad4ee48 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/metadata.json
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=2, max_speakers=4)",
+    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=2, max_speakers=4, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 48,
+      "Ios17.sliceByIndex" : 50,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 14,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 2 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 25 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 2, 345]",
+        "shape" : "[1, 25, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 25}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/model.mil b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/model.mil
index 06fd0af3c049963c87924d0249246fe8af7890b0..4e4bf7dc4c9439ee3979e99bab1dbb11b541124b 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/model.mil
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlmodelc/model.mil
@@ -1,234 +1,248 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 2, 345]> features, tensor<fp32, [2]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [2, 2]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [2]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [2, 2]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 2, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 25, 23]> features, tensor<fp32, [2]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [2, 2]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [2]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [2, 2]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, [3]>([1, 2, 345])];
+            tensor<fp32, [1, 2, 345]> input_1 = reshape(shape = var_36, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_38 = const()[name = tensor<string, []>("op_38"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_45 = const()[name = tensor<string, []>("op_45"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_48 = const()[name = tensor<string, []>("op_48"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_50 = const()[name = tensor<string, []>("op_50"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_51 = const()[name = tensor<string, []>("op_51"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_53 = const()[name = tensor<string, []>("op_53"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_58 = const()[name = tensor<string, []>("op_58"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 2, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 2, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 2, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_45, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 2, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 2, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_183 = const()[name = tensor<string, []>("op_183"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_184 = mul(x = input_13, y = var_183)[name = tensor<string, []>("op_184")];
+            tensor<fp32, [1, 2, 256]> input_15 = add(x = var_184, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_45, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,153 +253,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 2, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 2, 256]> var_198 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_199 = const()[name = tensor<string, []>("op_199"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_200 = reshape(shape = var_199, x = var_198)[name = tensor<string, []>("op_200")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 2, 256]> var_204 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_205 = const()[name = tensor<string, []>("op_205"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_206 = mul(x = var_204, y = var_205)[name = tensor<string, []>("op_206")];
+            tensor<int32, [4]> var_207 = const()[name = tensor<string, []>("op_207"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_208 = reshape(shape = var_207, x = var_206)[name = tensor<string, []>("op_208")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 2, 256]> var_212 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_213 = const()[name = tensor<string, []>("op_213"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_214 = reshape(shape = var_213, x = var_212)[name = tensor<string, []>("op_214")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 2, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [2]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_208)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_200)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 2, 2]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [2, 2]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 2, 2]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_224 = const()[name = tensor<string, []>("op_224"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_225 = reshape(shape = var_224, x = sqrt_s_t_1)[name = tensor<string, []>("op_225")];
+            tensor<fp32, [2, 2]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_225)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 2, 2]> var_227 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_227")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [2]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_214)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_227, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_229_transpose_x_0 = const()[name = tensor<string, []>("op_229_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_229_transpose_y_0 = const()[name = tensor<string, []>("op_229_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_229 = matmul(transpose_x = var_229_transpose_x_0, transpose_y = var_229_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_229")];
+            tensor<fp32, [2]> var_230 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_230")];
+            tensor<int32, [4]> var_231 = const()[name = tensor<string, []>("op_231"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_232 = reshape(shape = var_231, x = var_230)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_229, y = var_232)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 2, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_235 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_235")];
+            tensor<bool, []> var_237_transpose_x_1 = const()[name = tensor<string, []>("op_237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_237_transpose_y_1 = const()[name = tensor<string, []>("op_237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_237 = matmul(transpose_x = var_237_transpose_x_1, transpose_y = var_237_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_237")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_235, y = var_237)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_239 = const()[name = tensor<string, []>("op_239"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_239)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_241 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 4, 64, 64]> var_242 = real_div(x = new_kv_unnorm_1, y = var_241)[name = tensor<string, []>("op_242")];
+            tensor<int32, [4]> var_243_perm_0 = const()[name = tensor<string, []>("op_243_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 2, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 2, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 2, 4, 64]> var_243 = transpose(perm = var_243_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_53, x = var_243)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_247 = const()[name = tensor<string, []>("op_247"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_247, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 2, 256]> var_249 = silu(x = input_19)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 2, 256]> input_21 = mul(x = var_249, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = x_3)[name = tensor<string, []>("op_257")];
+            tensor<int32, [3]> var_260_begin_0 = const()[name = tensor<string, []>("op_260_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_260_end_0 = const()[name = tensor<string, []>("op_260_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_260_end_mask_0 = const()[name = tensor<string, []>("op_260_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_260 = slice_by_index(begin = var_260_begin_0, end = var_260_end_0, end_mask = var_260_end_mask_0, x = window_1)[name = tensor<string, []>("op_260")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_62, interleave = window_3_interleave_0, values = (var_260, var_257))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_265_begin_0 = const()[name = tensor<string, []>("op_265_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_265_end_0 = const()[name = tensor<string, []>("op_265_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_265_end_mask_0 = const()[name = tensor<string, []>("op_265_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_265 = slice_by_index(begin = var_265_begin_0, end = var_265_end_0, end_mask = var_265_end_mask_0, x = x_3)[name = tensor<string, []>("op_265")];
+            tensor<int32, [3]> var_268_begin_0 = const()[name = tensor<string, []>("op_268_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_268_end_0 = const()[name = tensor<string, []>("op_268_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_268_end_mask_0 = const()[name = tensor<string, []>("op_268_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_268 = slice_by_index(begin = var_268_begin_0, end = var_268_end_0, end_mask = var_268_end_mask_0, x = window_3)[name = tensor<string, []>("op_268")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_62, interleave = window_5_interleave_0, values = (var_268, var_265))[name = tensor<string, []>("window_5")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_23 = concat(axis = var_48, interleave = input_23_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_257_split_sizes_0 = const()[name = tensor<string, []>("op_257_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_257_axis_0 = const()[name = tensor<string, []>("op_257_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_257_0, tensor<fp32, [2, 256, 16]> var_257_1 = split(axis = var_257_axis_0, split_sizes = var_257_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_257")];
-            tensor<fp32, [2, 256, 16]> var_259 = sigmoid(x = var_257_1)[name = tensor<string, []>("op_259")];
-            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_257_0, y = var_259)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [2, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_293_split_sizes_0 = const()[name = tensor<string, []>("op_293_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_293_axis_0 = const()[name = tensor<string, []>("op_293_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_293_0, tensor<fp32, [2, 256, 16]> var_293_1 = split(axis = var_293_axis_0, split_sizes = var_293_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_293")];
+            tensor<fp32, [2, 256, 16]> var_295 = sigmoid(x = var_293_1)[name = tensor<string, []>("op_295")];
+            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_293_0, y = var_295)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [2, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_45, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [2, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [2, 1, 256]> var_290 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_290")];
-            tensor<int32, [3]> var_292_perm_0 = const()[name = tensor<string, []>("op_292_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_292 = transpose(perm = var_292_perm_0, x = var_290)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 2, 256]> input_31 = add(x = x_3, y = var_292)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 2, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 2, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_315 = const()[name = tensor<string, []>("op_315"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_316 = mul(x = input_39, y = var_315)[name = tensor<string, []>("op_316")];
-            tensor<fp32, [1, 2, 256]> input_41 = add(x = var_316, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_326_begin_0 = const()[name = tensor<string, []>("op_326_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_326_end_0 = const()[name = tensor<string, []>("op_326_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_326_end_mask_0 = const()[name = tensor<string, []>("op_326_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [2, 1, 256]> var_326 = slice_by_index(begin = var_326_begin_0, end = var_326_end_0, end_mask = var_326_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_326")];
+            tensor<int32, [3]> var_328_perm_0 = const()[name = tensor<string, []>("op_328_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_328 = transpose(perm = var_328_perm_0, x = var_326)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 2, 256]> input_33 = add(x = x_3, y = var_328)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 2, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 2, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_351 = const()[name = tensor<string, []>("op_351"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_352 = mul(x = input_41, y = var_351)[name = tensor<string, []>("op_352")];
+            tensor<fp32, [1, 2, 256]> input_43 = add(x = var_352, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 2, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 2, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_345 = const()[name = tensor<string, []>("op_345"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_346 = mul(x = input_51, y = var_345)[name = tensor<string, []>("op_346")];
-            tensor<fp32, [1, 2, 256]> input_53 = add(x = var_346, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_45, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 2, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 2, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_381 = const()[name = tensor<string, []>("op_381"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_382 = mul(x = input_53, y = var_381)[name = tensor<string, []>("op_382")];
+            tensor<fp32, [1, 2, 256]> input_55 = add(x = var_382, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_45, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -396,153 +410,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 2, 256]> var_360 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 2, 256]> var_396 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_397 = const()[name = tensor<string, []>("op_397"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_398 = reshape(shape = var_397, x = var_396)[name = tensor<string, []>("op_398")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_366 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_368 = mul(x = var_366, y = var_367)[name = tensor<string, []>("op_368")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 2, 256]> var_402 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_403 = const()[name = tensor<string, []>("op_403"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_404 = mul(x = var_402, y = var_403)[name = tensor<string, []>("op_404")];
+            tensor<int32, [4]> var_405 = const()[name = tensor<string, []>("op_405"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_406 = reshape(shape = var_405, x = var_404)[name = tensor<string, []>("op_406")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_374 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_376 = reshape(shape = var_375, x = var_374)[name = tensor<string, []>("op_376")];
+            tensor<fp32, [1, 2, 256]> var_410 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 2, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [2]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_406)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_398)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 2, 2]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_387 = reshape(shape = var_386, x = sqrt_s_t_3)[name = tensor<string, []>("op_387")];
-            tensor<fp32, [2, 2]> M_3 = real_div(x = encoder__causal_mask, y = var_387)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 2, 2]> var_389 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_389")];
+            tensor<int32, [2]> var_422 = const()[name = tensor<string, []>("op_422"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_423 = reshape(shape = var_422, x = sqrt_s_t_3)[name = tensor<string, []>("op_423")];
+            tensor<fp32, [2, 2]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_423)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 2, 2]> var_425 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_425")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_376)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_389, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_391_transpose_x_0 = const()[name = tensor<string, []>("op_391_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_391_transpose_y_0 = const()[name = tensor<string, []>("op_391_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_0, transpose_y = var_391_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [2]> var_392 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_392")];
-            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
-            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_391, y = var_394)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_412)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_425, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_427_transpose_x_0 = const()[name = tensor<string, []>("op_427_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_427_transpose_y_0 = const()[name = tensor<string, []>("op_427_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_427 = matmul(transpose_x = var_427_transpose_x_0, transpose_y = var_427_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_427")];
+            tensor<fp32, [2]> var_428 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_428")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_427, y = var_430)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 2, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_397 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_397")];
-            tensor<bool, []> var_399_transpose_x_1 = const()[name = tensor<string, []>("op_399_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_399_transpose_y_1 = const()[name = tensor<string, []>("op_399_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_1, transpose_y = var_399_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_397, y = var_399)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_401)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_403 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 4, 64, 64]> var_404 = real_div(x = new_kv_unnorm_3, y = var_403)[name = tensor<string, []>("op_404")];
-            tensor<int32, [4]> var_405_perm_0 = const()[name = tensor<string, []>("op_405_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_433 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_433")];
+            tensor<bool, []> var_435_transpose_x_1 = const()[name = tensor<string, []>("op_435_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_435_transpose_y_1 = const()[name = tensor<string, []>("op_435_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_435 = matmul(transpose_x = var_435_transpose_x_1, transpose_y = var_435_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_435")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_433, y = var_435)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_437 = const()[name = tensor<string, []>("op_437"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_437)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_439 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_439")];
+            tensor<fp32, [1, 4, 64, 64]> var_440 = real_div(x = new_kv_unnorm_3, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441_perm_0 = const()[name = tensor<string, []>("op_441_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_405 = transpose(perm = var_405_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_405)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_409, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 2, 256]> var_411 = silu(x = input_57)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 2, 256]> input_59 = mul(x = var_411, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 2, 4, 64]> var_441 = transpose(perm = var_441_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_53, x = var_441)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_445 = const()[name = tensor<string, []>("op_445"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_445, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 2, 256]> var_447 = silu(x = input_59)[name = tensor<string, []>("op_447")];
+            tensor<fp32, [1, 2, 256]> input_61 = mul(x = var_447, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_7_begin_0 = const()[name = tensor<string, []>("window_7_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_7_end_0 = const()[name = tensor<string, []>("window_7_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_7_end_mask_0 = const()[name = tensor<string, []>("window_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_7_squeeze_mask_0 = const()[name = tensor<string, []>("window_7_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_7 = slice_by_index(begin = window_7_begin_0, end = window_7_end_0, end_mask = window_7_end_mask_0, squeeze_mask = window_7_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_419 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = x_9)[name = tensor<string, []>("op_419")];
-            tensor<int32, [3]> var_422_begin_0 = const()[name = tensor<string, []>("op_422_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_422_end_0 = const()[name = tensor<string, []>("op_422_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_422_end_mask_0 = const()[name = tensor<string, []>("op_422_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_422 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = window_7)[name = tensor<string, []>("op_422")];
+            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = x_9)[name = tensor<string, []>("op_455")];
+            tensor<int32, [3]> var_458_begin_0 = const()[name = tensor<string, []>("op_458_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_458_end_0 = const()[name = tensor<string, []>("op_458_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_458_end_mask_0 = const()[name = tensor<string, []>("op_458_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_458 = slice_by_index(begin = var_458_begin_0, end = var_458_end_0, end_mask = var_458_end_mask_0, x = window_7)[name = tensor<string, []>("op_458")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_422, var_419))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_62, interleave = window_9_interleave_0, values = (var_458, var_455))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = x_9)[name = tensor<string, []>("op_463")];
+            tensor<int32, [3]> var_466_begin_0 = const()[name = tensor<string, []>("op_466_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_466_end_0 = const()[name = tensor<string, []>("op_466_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_466_end_mask_0 = const()[name = tensor<string, []>("op_466_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_466 = slice_by_index(begin = var_466_begin_0, end = var_466_end_0, end_mask = var_466_end_mask_0, x = window_9)[name = tensor<string, []>("op_466")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_62, interleave = window_11_interleave_0, values = (var_466, var_463))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_63 = concat(axis = var_48, interleave = input_63_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_455_split_sizes_0 = const()[name = tensor<string, []>("op_455_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_455_axis_0 = const()[name = tensor<string, []>("op_455_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_455_0, tensor<fp32, [2, 256, 16]> var_455_1 = split(axis = var_455_axis_0, split_sizes = var_455_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_455")];
-            tensor<fp32, [2, 256, 16]> var_457 = sigmoid(x = var_455_1)[name = tensor<string, []>("op_457")];
-            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_455_0, y = var_457)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [2, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_491_split_sizes_0 = const()[name = tensor<string, []>("op_491_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_491_axis_0 = const()[name = tensor<string, []>("op_491_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_491_0, tensor<fp32, [2, 256, 16]> var_491_1 = split(axis = var_491_axis_0, split_sizes = var_491_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_491")];
+            tensor<fp32, [2, 256, 16]> var_493 = sigmoid(x = var_491_1)[name = tensor<string, []>("op_493")];
+            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_491_0, y = var_493)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [2, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_45, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [2, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_488_begin_0 = const()[name = tensor<string, []>("op_488_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_488_end_0 = const()[name = tensor<string, []>("op_488_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_488_end_mask_0 = const()[name = tensor<string, []>("op_488_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [2, 1, 256]> var_488 = slice_by_index(begin = var_488_begin_0, end = var_488_end_0, end_mask = var_488_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_488")];
-            tensor<int32, [3]> var_490_perm_0 = const()[name = tensor<string, []>("op_490_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_490 = transpose(perm = var_490_perm_0, x = var_488)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 2, 256]> input_71 = add(x = x_9, y = var_490)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 2, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 2, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_513 = const()[name = tensor<string, []>("op_513"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_514 = mul(x = input_79, y = var_513)[name = tensor<string, []>("op_514")];
-            tensor<fp32, [1, 2, 256]> input_81 = add(x = var_514, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_524_begin_0 = const()[name = tensor<string, []>("op_524_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_524_end_0 = const()[name = tensor<string, []>("op_524_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_524_end_mask_0 = const()[name = tensor<string, []>("op_524_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [2, 1, 256]> var_524 = slice_by_index(begin = var_524_begin_0, end = var_524_end_0, end_mask = var_524_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_524")];
+            tensor<int32, [3]> var_526_perm_0 = const()[name = tensor<string, []>("op_526_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_526 = transpose(perm = var_526_perm_0, x = var_524)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 2, 256]> input_73 = add(x = x_9, y = var_526)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 2, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 2, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_550 = mul(x = input_81, y = var_549)[name = tensor<string, []>("op_550")];
+            tensor<fp32, [1, 2, 256]> input_83 = add(x = var_550, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 2, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 2, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_544 = mul(x = input_91, y = var_543)[name = tensor<string, []>("op_544")];
-            tensor<fp32, [1, 2, 256]> input_93 = add(x = var_544, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_45, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 2, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 2, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_579 = const()[name = tensor<string, []>("op_579"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_580 = mul(x = input_93, y = var_579)[name = tensor<string, []>("op_580")];
+            tensor<fp32, [1, 2, 256]> input_95 = add(x = var_580, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_45, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -553,153 +567,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 2, 256]> var_558 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_560 = reshape(shape = var_559, x = var_558)[name = tensor<string, []>("op_560")];
+            tensor<fp32, [1, 2, 256]> var_594 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_595 = const()[name = tensor<string, []>("op_595"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_596 = reshape(shape = var_595, x = var_594)[name = tensor<string, []>("op_596")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_564 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_565 = const()[name = tensor<string, []>("op_565"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_566 = mul(x = var_564, y = var_565)[name = tensor<string, []>("op_566")];
-            tensor<int32, [4]> var_567 = const()[name = tensor<string, []>("op_567"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_568 = reshape(shape = var_567, x = var_566)[name = tensor<string, []>("op_568")];
+            tensor<fp32, [1, 2, 256]> var_600 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_602 = mul(x = var_600, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<int32, [4]> var_603 = const()[name = tensor<string, []>("op_603"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_604 = reshape(shape = var_603, x = var_602)[name = tensor<string, []>("op_604")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_572 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_573 = const()[name = tensor<string, []>("op_573"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_574 = reshape(shape = var_573, x = var_572)[name = tensor<string, []>("op_574")];
+            tensor<fp32, [1, 2, 256]> var_608 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_610 = reshape(shape = var_609, x = var_608)[name = tensor<string, []>("op_610")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 2, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [2]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_568)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_560)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_604)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_596)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 2, 2]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_584 = const()[name = tensor<string, []>("op_584"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_585 = reshape(shape = var_584, x = sqrt_s_t_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [2, 2]> M_5 = real_div(x = encoder__causal_mask, y = var_585)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 2, 2]> var_587 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_587")];
+            tensor<int32, [2]> var_620 = const()[name = tensor<string, []>("op_620"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_621 = reshape(shape = var_620, x = sqrt_s_t_5)[name = tensor<string, []>("op_621")];
+            tensor<fp32, [2, 2]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_621)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 2, 2]> var_623 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_623")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_574)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_587, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_589_transpose_x_0 = const()[name = tensor<string, []>("op_589_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_589_transpose_y_0 = const()[name = tensor<string, []>("op_589_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_589 = matmul(transpose_x = var_589_transpose_x_0, transpose_y = var_589_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_589")];
-            tensor<fp32, [2]> var_590 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_590")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
-            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_589, y = var_592)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_610)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_623, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_625_transpose_x_0 = const()[name = tensor<string, []>("op_625_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_625_transpose_y_0 = const()[name = tensor<string, []>("op_625_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_625 = matmul(transpose_x = var_625_transpose_x_0, transpose_y = var_625_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_625")];
+            tensor<fp32, [2]> var_626 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_626")];
+            tensor<int32, [4]> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_628 = reshape(shape = var_627, x = var_626)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_625, y = var_628)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 2, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_595 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_595")];
-            tensor<bool, []> var_597_transpose_x_1 = const()[name = tensor<string, []>("op_597_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_597_transpose_y_1 = const()[name = tensor<string, []>("op_597_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_597 = matmul(transpose_x = var_597_transpose_x_1, transpose_y = var_597_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_597")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_595, y = var_597)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_599)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_601 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [1, 4, 64, 64]> var_602 = real_div(x = new_kv_unnorm_5, y = var_601)[name = tensor<string, []>("op_602")];
-            tensor<int32, [4]> var_603_perm_0 = const()[name = tensor<string, []>("op_603_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_631 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_631")];
+            tensor<bool, []> var_633_transpose_x_1 = const()[name = tensor<string, []>("op_633_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_633_transpose_y_1 = const()[name = tensor<string, []>("op_633_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_633 = matmul(transpose_x = var_633_transpose_x_1, transpose_y = var_633_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_633")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_631, y = var_633)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_635)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_637 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_637")];
+            tensor<fp32, [1, 4, 64, 64]> var_638 = real_div(x = new_kv_unnorm_5, y = var_637)[name = tensor<string, []>("op_638")];
+            tensor<int32, [4]> var_639_perm_0 = const()[name = tensor<string, []>("op_639_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_603 = transpose(perm = var_603_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_603)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_607, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 2, 256]> var_609 = silu(x = input_97)[name = tensor<string, []>("op_609")];
-            tensor<fp32, [1, 2, 256]> input_99 = mul(x = var_609, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 2, 4, 64]> var_639 = transpose(perm = var_639_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_53, x = var_639)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_643 = const()[name = tensor<string, []>("op_643"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_643, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 2, 256]> var_645 = silu(x = input_99)[name = tensor<string, []>("op_645")];
+            tensor<fp32, [1, 2, 256]> input_101 = mul(x = var_645, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_617_begin_0 = const()[name = tensor<string, []>("op_617_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_617_end_0 = const()[name = tensor<string, []>("op_617_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_617_end_mask_0 = const()[name = tensor<string, []>("op_617_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_617 = slice_by_index(begin = var_617_begin_0, end = var_617_end_0, end_mask = var_617_end_mask_0, x = x_15)[name = tensor<string, []>("op_617")];
-            tensor<int32, [3]> var_620_begin_0 = const()[name = tensor<string, []>("op_620_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_620_end_0 = const()[name = tensor<string, []>("op_620_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_620_end_mask_0 = const()[name = tensor<string, []>("op_620_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_620 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = window_13)[name = tensor<string, []>("op_620")];
+            tensor<int32, [3]> var_653_begin_0 = const()[name = tensor<string, []>("op_653_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_653_end_0 = const()[name = tensor<string, []>("op_653_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_653_end_mask_0 = const()[name = tensor<string, []>("op_653_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_653 = slice_by_index(begin = var_653_begin_0, end = var_653_end_0, end_mask = var_653_end_mask_0, x = x_15)[name = tensor<string, []>("op_653")];
+            tensor<int32, [3]> var_656_begin_0 = const()[name = tensor<string, []>("op_656_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_656_end_0 = const()[name = tensor<string, []>("op_656_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_656_end_mask_0 = const()[name = tensor<string, []>("op_656_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_656 = slice_by_index(begin = var_656_begin_0, end = var_656_end_0, end_mask = var_656_end_mask_0, x = window_13)[name = tensor<string, []>("op_656")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_620, var_617))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_625_begin_0 = const()[name = tensor<string, []>("op_625_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_625_end_0 = const()[name = tensor<string, []>("op_625_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_625_end_mask_0 = const()[name = tensor<string, []>("op_625_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_625 = slice_by_index(begin = var_625_begin_0, end = var_625_end_0, end_mask = var_625_end_mask_0, x = x_15)[name = tensor<string, []>("op_625")];
-            tensor<int32, [3]> var_628_begin_0 = const()[name = tensor<string, []>("op_628_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_628_end_0 = const()[name = tensor<string, []>("op_628_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_628_end_mask_0 = const()[name = tensor<string, []>("op_628_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_628 = slice_by_index(begin = var_628_begin_0, end = var_628_end_0, end_mask = var_628_end_mask_0, x = window_15)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_62, interleave = window_15_interleave_0, values = (var_656, var_653))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_661_begin_0 = const()[name = tensor<string, []>("op_661_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_661_end_0 = const()[name = tensor<string, []>("op_661_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_661_end_mask_0 = const()[name = tensor<string, []>("op_661_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_661 = slice_by_index(begin = var_661_begin_0, end = var_661_end_0, end_mask = var_661_end_mask_0, x = x_15)[name = tensor<string, []>("op_661")];
+            tensor<int32, [3]> var_664_begin_0 = const()[name = tensor<string, []>("op_664_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_664_end_0 = const()[name = tensor<string, []>("op_664_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_664_end_mask_0 = const()[name = tensor<string, []>("op_664_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_664 = slice_by_index(begin = var_664_begin_0, end = var_664_end_0, end_mask = var_664_end_mask_0, x = window_15)[name = tensor<string, []>("op_664")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_628, var_625))[name = tensor<string, []>("window_17")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_62, interleave = window_17_interleave_0, values = (var_664, var_661))[name = tensor<string, []>("window_17")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_103 = concat(axis = var_48, interleave = input_103_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_653_split_sizes_0 = const()[name = tensor<string, []>("op_653_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_653_axis_0 = const()[name = tensor<string, []>("op_653_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_653_0, tensor<fp32, [2, 256, 16]> var_653_1 = split(axis = var_653_axis_0, split_sizes = var_653_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_653")];
-            tensor<fp32, [2, 256, 16]> var_655 = sigmoid(x = var_653_1)[name = tensor<string, []>("op_655")];
-            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_653_0, y = var_655)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [2, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_689_split_sizes_0 = const()[name = tensor<string, []>("op_689_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_689_axis_0 = const()[name = tensor<string, []>("op_689_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_689_0, tensor<fp32, [2, 256, 16]> var_689_1 = split(axis = var_689_axis_0, split_sizes = var_689_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [2, 256, 16]> var_691 = sigmoid(x = var_689_1)[name = tensor<string, []>("op_691")];
+            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_689_0, y = var_691)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [2, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_45, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [2, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_686_begin_0 = const()[name = tensor<string, []>("op_686_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_686_end_0 = const()[name = tensor<string, []>("op_686_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_686_end_mask_0 = const()[name = tensor<string, []>("op_686_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [2, 1, 256]> var_686 = slice_by_index(begin = var_686_begin_0, end = var_686_end_0, end_mask = var_686_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_686")];
-            tensor<int32, [3]> var_688_perm_0 = const()[name = tensor<string, []>("op_688_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_688 = transpose(perm = var_688_perm_0, x = var_686)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 2, 256]> input_111 = add(x = x_15, y = var_688)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 2, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 2, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_711 = const()[name = tensor<string, []>("op_711"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_712 = mul(x = input_119, y = var_711)[name = tensor<string, []>("op_712")];
-            tensor<fp32, [1, 2, 256]> input_121 = add(x = var_712, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_722_begin_0 = const()[name = tensor<string, []>("op_722_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_722_end_0 = const()[name = tensor<string, []>("op_722_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_722_end_mask_0 = const()[name = tensor<string, []>("op_722_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [2, 1, 256]> var_722 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_722")];
+            tensor<int32, [3]> var_724_perm_0 = const()[name = tensor<string, []>("op_724_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_724 = transpose(perm = var_724_perm_0, x = var_722)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 2, 256]> input_113 = add(x = x_15, y = var_724)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 2, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 2, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_748 = mul(x = input_121, y = var_747)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 2, 256]> input_123 = add(x = var_748, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 2, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 2, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_742 = mul(x = input_131, y = var_741)[name = tensor<string, []>("op_742")];
-            tensor<fp32, [1, 2, 256]> input_133 = add(x = var_742, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_45, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 2, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 2, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_777 = const()[name = tensor<string, []>("op_777"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_778 = mul(x = input_133, y = var_777)[name = tensor<string, []>("op_778")];
+            tensor<fp32, [1, 2, 256]> input_135 = add(x = var_778, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_45, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -710,189 +724,182 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 2, 256]> var_756 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_757 = const()[name = tensor<string, []>("op_757"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_758 = reshape(shape = var_757, x = var_756)[name = tensor<string, []>("op_758")];
+            tensor<fp32, [1, 2, 256]> var_792 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_793 = const()[name = tensor<string, []>("op_793"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_794 = reshape(shape = var_793, x = var_792)[name = tensor<string, []>("op_794")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_762 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_763 = const()[name = tensor<string, []>("op_763"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_764 = mul(x = var_762, y = var_763)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
+            tensor<fp32, [1, 2, 256]> var_798 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_800 = mul(x = var_798, y = var_799)[name = tensor<string, []>("op_800")];
+            tensor<int32, [4]> var_801 = const()[name = tensor<string, []>("op_801"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_802 = reshape(shape = var_801, x = var_800)[name = tensor<string, []>("op_802")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_770 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_771 = const()[name = tensor<string, []>("op_771"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_772 = reshape(shape = var_771, x = var_770)[name = tensor<string, []>("op_772")];
+            tensor<fp32, [1, 2, 256]> var_806 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_808 = reshape(shape = var_807, x = var_806)[name = tensor<string, []>("op_808")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 2, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [2]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_766)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_758)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_802)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_794)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 2, 2]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_783 = reshape(shape = var_782, x = sqrt_s_t_7)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [2, 2]> M_7 = real_div(x = encoder__causal_mask, y = var_783)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 2, 2]> var_785 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_785")];
+            tensor<int32, [2]> var_818 = const()[name = tensor<string, []>("op_818"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_819 = reshape(shape = var_818, x = sqrt_s_t_7)[name = tensor<string, []>("op_819")];
+            tensor<fp32, [2, 2]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_819)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 2, 2]> var_821 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_821")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_772)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_785, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_787_transpose_x_0 = const()[name = tensor<string, []>("op_787_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_787_transpose_y_0 = const()[name = tensor<string, []>("op_787_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_787 = matmul(transpose_x = var_787_transpose_x_0, transpose_y = var_787_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_787")];
-            tensor<fp32, [2]> var_788 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_787, y = var_790)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_808)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_821, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_823_transpose_x_0 = const()[name = tensor<string, []>("op_823_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_823_transpose_y_0 = const()[name = tensor<string, []>("op_823_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_823 = matmul(transpose_x = var_823_transpose_x_0, transpose_y = var_823_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_823")];
+            tensor<fp32, [2]> var_824 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_824")];
+            tensor<int32, [4]> var_825 = const()[name = tensor<string, []>("op_825"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_826 = reshape(shape = var_825, x = var_824)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_823, y = var_826)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 2, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_793 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_793")];
-            tensor<bool, []> var_795_transpose_x_1 = const()[name = tensor<string, []>("op_795_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_795_transpose_y_1 = const()[name = tensor<string, []>("op_795_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_795 = matmul(transpose_x = var_795_transpose_x_1, transpose_y = var_795_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_795")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_793, y = var_795)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_797 = const()[name = tensor<string, []>("op_797"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_797)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_799 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_799")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_799)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_801_perm_0 = const()[name = tensor<string, []>("op_801_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_829 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_829")];
+            tensor<bool, []> var_831_transpose_x_1 = const()[name = tensor<string, []>("op_831_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_831_transpose_y_1 = const()[name = tensor<string, []>("op_831_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_831 = matmul(transpose_x = var_831_transpose_x_1, transpose_y = var_831_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_829, y = var_831)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_833)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_835 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_835")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_835)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_837_perm_0 = const()[name = tensor<string, []>("op_837_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_801 = transpose(perm = var_801_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_801)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_805, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 2, 256]> var_807 = silu(x = input_137)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [1, 2, 256]> input_139 = mul(x = var_807, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 2, 4, 64]> var_837 = transpose(perm = var_837_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_53, x = var_837)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_841, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 2, 256]> var_843 = silu(x = input_139)[name = tensor<string, []>("op_843")];
+            tensor<fp32, [1, 2, 256]> input_141 = mul(x = var_843, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_19_begin_0 = const()[name = tensor<string, []>("window_19_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_19_end_0 = const()[name = tensor<string, []>("window_19_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_19_end_mask_0 = const()[name = tensor<string, []>("window_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_19_squeeze_mask_0 = const()[name = tensor<string, []>("window_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_19 = slice_by_index(begin = window_19_begin_0, end = window_19_end_0, end_mask = window_19_end_mask_0, squeeze_mask = window_19_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_815_begin_0 = const()[name = tensor<string, []>("op_815_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_815_end_0 = const()[name = tensor<string, []>("op_815_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_815_end_mask_0 = const()[name = tensor<string, []>("op_815_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_815 = slice_by_index(begin = var_815_begin_0, end = var_815_end_0, end_mask = var_815_end_mask_0, x = x_21)[name = tensor<string, []>("op_815")];
-            tensor<int32, [3]> var_818_begin_0 = const()[name = tensor<string, []>("op_818_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_818_end_0 = const()[name = tensor<string, []>("op_818_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_818_end_mask_0 = const()[name = tensor<string, []>("op_818_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_818 = slice_by_index(begin = var_818_begin_0, end = var_818_end_0, end_mask = var_818_end_mask_0, x = window_19)[name = tensor<string, []>("op_818")];
+            tensor<int32, [3]> var_851_begin_0 = const()[name = tensor<string, []>("op_851_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_851_end_0 = const()[name = tensor<string, []>("op_851_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_851_end_mask_0 = const()[name = tensor<string, []>("op_851_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_851 = slice_by_index(begin = var_851_begin_0, end = var_851_end_0, end_mask = var_851_end_mask_0, x = x_21)[name = tensor<string, []>("op_851")];
+            tensor<int32, [3]> var_854_begin_0 = const()[name = tensor<string, []>("op_854_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_854_end_0 = const()[name = tensor<string, []>("op_854_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_854_end_mask_0 = const()[name = tensor<string, []>("op_854_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_854 = slice_by_index(begin = var_854_begin_0, end = var_854_end_0, end_mask = var_854_end_mask_0, x = window_19)[name = tensor<string, []>("op_854")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_818, var_815))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_823_begin_0 = const()[name = tensor<string, []>("op_823_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_823_end_0 = const()[name = tensor<string, []>("op_823_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_823_end_mask_0 = const()[name = tensor<string, []>("op_823_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_823 = slice_by_index(begin = var_823_begin_0, end = var_823_end_0, end_mask = var_823_end_mask_0, x = x_21)[name = tensor<string, []>("op_823")];
-            tensor<int32, [3]> var_826_begin_0 = const()[name = tensor<string, []>("op_826_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_826_end_0 = const()[name = tensor<string, []>("op_826_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_826_end_mask_0 = const()[name = tensor<string, []>("op_826_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_826 = slice_by_index(begin = var_826_begin_0, end = var_826_end_0, end_mask = var_826_end_mask_0, x = window_21)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_62, interleave = window_21_interleave_0, values = (var_854, var_851))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_859_begin_0 = const()[name = tensor<string, []>("op_859_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_859_end_0 = const()[name = tensor<string, []>("op_859_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_859_end_mask_0 = const()[name = tensor<string, []>("op_859_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_859 = slice_by_index(begin = var_859_begin_0, end = var_859_end_0, end_mask = var_859_end_mask_0, x = x_21)[name = tensor<string, []>("op_859")];
+            tensor<int32, [3]> var_862_begin_0 = const()[name = tensor<string, []>("op_862_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_862_end_0 = const()[name = tensor<string, []>("op_862_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_862_end_mask_0 = const()[name = tensor<string, []>("op_862_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_862 = slice_by_index(begin = var_862_begin_0, end = var_862_end_0, end_mask = var_862_end_mask_0, x = window_21)[name = tensor<string, []>("op_862")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_826, var_823))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_62, interleave = window_interleave_0, values = (var_862, var_859))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_143 = concat(axis = var_48, interleave = input_143_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_851_split_sizes_0 = const()[name = tensor<string, []>("op_851_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_851_axis_0 = const()[name = tensor<string, []>("op_851_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_851_0, tensor<fp32, [2, 256, 16]> var_851_1 = split(axis = var_851_axis_0, split_sizes = var_851_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_851")];
-            tensor<fp32, [2, 256, 16]> var_853 = sigmoid(x = var_851_1)[name = tensor<string, []>("op_853")];
-            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_851_0, y = var_853)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [2, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_887_split_sizes_0 = const()[name = tensor<string, []>("op_887_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_887_axis_0 = const()[name = tensor<string, []>("op_887_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_887_0, tensor<fp32, [2, 256, 16]> var_887_1 = split(axis = var_887_axis_0, split_sizes = var_887_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [2, 256, 16]> var_889 = sigmoid(x = var_887_1)[name = tensor<string, []>("op_889")];
+            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_887_0, y = var_889)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [2, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_45, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [2, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_884_begin_0 = const()[name = tensor<string, []>("op_884_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_884_end_0 = const()[name = tensor<string, []>("op_884_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_884_end_mask_0 = const()[name = tensor<string, []>("op_884_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [2, 1, 256]> var_884 = slice_by_index(begin = var_884_begin_0, end = var_884_end_0, end_mask = var_884_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_884")];
-            tensor<int32, [3]> var_886_perm_0 = const()[name = tensor<string, []>("op_886_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_886 = transpose(perm = var_886_perm_0, x = var_884)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 2, 256]> input_151 = add(x = x_21, y = var_886)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 2, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 2, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_910 = mul(x = input_159, y = var_909)[name = tensor<string, []>("op_910")];
-            tensor<fp32, [1, 2, 256]> input_161 = add(x = var_910, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [2, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_920")];
+            tensor<int32, [3]> var_922_perm_0 = const()[name = tensor<string, []>("op_922_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_922 = transpose(perm = var_922_perm_0, x = var_920)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 2, 256]> input_153 = add(x = x_21, y = var_922)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_45, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 2, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 2, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_945 = const()[name = tensor<string, []>("op_945"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_946 = mul(x = input_161, y = var_945)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 2, 256]> input_163 = add(x = var_946, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_45, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 2]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_50, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
-            tensor<int32, [3]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
-            tensor<bool, [3]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = cat)[name = tensor<string, []>("op_928")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 2, 1]> var_931 = reduce_l2_norm(axes = var_930, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
+            tensor<int32, [3]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
+            tensor<bool, [3]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = cat)[name = tensor<string, []>("op_964")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_966 = const()[name = tensor<string, []>("op_966"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 2, 1]> var_967 = reduce_l2_norm(axes = var_966, keep_dims = var_44, x = input_165)[name = tensor<string, []>("op_967")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_931)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_935_axis_0 = const()[name = tensor<string, []>("op_935_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_935_axis_0, values = (var_206, var_404, var_602, nkv_1))[name = tensor<string, []>("op_935")];
-            tensor<int32, []> var_937_axis_0 = const()[name = tensor<string, []>("op_937_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_937_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_937")];
-            tensor<int32, []> var_939_axis_0 = const()[name = tensor<string, []>("op_939_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_939_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_939")];
-            tensor<fp32, []> var_948 = const()[name = tensor<string, []>("op_948"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_953 = const()[name = tensor<string, []>("op_953"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_955 = const()[name = tensor<string, []>("op_955"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_956 = const()[name = tensor<string, []>("op_956"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_958 = const()[name = tensor<string, []>("op_958"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_968 = const()[name = tensor<string, []>("op_968"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_58, beta = const_12, x = var_967)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_971_axis_0, values = (var_242, var_440, var_638, nkv_1))[name = tensor<string, []>("op_971")];
+            tensor<int32, []> var_973_axis_0 = const()[name = tensor<string, []>("op_973_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_973_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_973")];
+            tensor<int32, []> var_975_axis_0 = const()[name = tensor<string, []>("op_975_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_975_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_975")];
             tensor<fp32, [1, 2, 6, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 2, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1030_axes_0 = const()[name = tensor<string, []>("op_1030_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 2, 1, 256]> var_1030 = expand_dims(axes = var_1030_axes_0, x = emb)[name = tensor<string, []>("op_1030")];
+            tensor<int32, [1]> var_1043_axes_0 = const()[name = tensor<string, []>("op_1043_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 2, 1, 256]> var_1043 = expand_dims(axes = var_1043_axes_0, x = emb)[name = tensor<string, []>("op_1043")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 6, 1])];
-            tensor<fp32, [1, 2, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1030)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 2, 6, 512]> input_165 = concat(axis = var_962, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 2, 6, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1038_perm_0 = const()[name = tensor<string, []>("op_1038_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1042 = const()[name = tensor<string, []>("op_1042"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [1, 6, 2, 256]> var_1038 = transpose(perm = var_1038_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [6, 2, 256]> x_29 = reshape(shape = var_1042, x = var_1038)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 2, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1043)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 2, 6, 512]> input_167 = concat(axis = var_51, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 2, 6, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1051_perm_0 = const()[name = tensor<string, []>("op_1051_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [1, 6, 2, 256]> var_1051 = transpose(perm = var_1051_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [6, 2, 256]> x_29 = reshape(shape = var_1055, x = var_1051)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -903,132 +910,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [6, 2, 256]> var_1050 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1051 = const()[name = tensor<string, []>("op_1051"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1052 = reshape(shape = var_1051, x = var_1050)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [6, 2, 256]> var_1063 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1064 = const()[name = tensor<string, []>("op_1064"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1065 = reshape(shape = var_1064, x = var_1063)[name = tensor<string, []>("op_1065")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> var_1056 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1057 = const()[name = tensor<string, []>("op_1057"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 2, 256]> var_1058 = mul(x = var_1056, y = var_1057)[name = tensor<string, []>("op_1058")];
-            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [6, 2, 256]> var_1069 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1070 = const()[name = tensor<string, []>("op_1070"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 2, 256]> var_1071 = mul(x = var_1069, y = var_1070)[name = tensor<string, []>("op_1071")];
+            tensor<int32, [4]> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1073 = reshape(shape = var_1072, x = var_1071)[name = tensor<string, []>("op_1073")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> var_1064 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1066 = reshape(shape = var_1065, x = var_1064)[name = tensor<string, []>("op_1066")];
+            tensor<fp32, [6, 2, 256]> var_1077 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1079 = reshape(shape = var_1078, x = var_1077)[name = tensor<string, []>("op_1079")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [6, 2, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_968, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_48, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [2]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_1 = clip(alpha = var_958, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [2]> clip_1 = clip(alpha = var_38, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [2]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1060)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [6, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1052)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [6, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1073)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [6, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1065)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [6, 4, 2, 2]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [2]>([1, 2])];
-            tensor<fp32, [1, 2]> var_1079 = reshape(shape = var_1078, x = valid_mask)[name = tensor<string, []>("op_1079")];
-            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1079)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1082 = reshape(shape = var_1081, x = sqrt_s_t_9)[name = tensor<string, []>("op_1082")];
-            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1082)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [6, 4, 2, 2]> var_1084 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1084")];
+            tensor<int32, [2]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [2]>([1, 2])];
+            tensor<fp32, [1, 2]> var_1092 = reshape(shape = var_1091, x = valid_mask)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1092)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1094 = const()[name = tensor<string, []>("op_1094"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1095 = reshape(shape = var_1094, x = sqrt_s_t_9)[name = tensor<string, []>("op_1095")];
+            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1095)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [6, 4, 2, 2]> var_1097 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1097")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1066)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [6, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1084, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1086_transpose_x_0 = const()[name = tensor<string, []>("op_1086_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1086_transpose_y_0 = const()[name = tensor<string, []>("op_1086_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 2, 64]> var_1086 = matmul(transpose_x = var_1086_transpose_x_0, transpose_y = var_1086_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1086")];
-            tensor<fp32, [2]> var_1087 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1087")];
-            tensor<int32, [4]> var_1088 = const()[name = tensor<string, []>("op_1088"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1089 = reshape(shape = var_1088, x = var_1087)[name = tensor<string, []>("op_1089")];
-            tensor<fp32, [6, 4, 2, 64]> cross_9 = mul(x = var_1086, y = var_1089)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [6, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1079)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [6, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1097, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1099_transpose_x_0 = const()[name = tensor<string, []>("op_1099_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1099_transpose_y_0 = const()[name = tensor<string, []>("op_1099_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 2, 64]> var_1099 = matmul(transpose_x = var_1099_transpose_x_0, transpose_y = var_1099_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1099")];
+            tensor<fp32, [2]> var_1100 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1100")];
+            tensor<int32, [4]> var_1101 = const()[name = tensor<string, []>("op_1101"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1102 = reshape(shape = var_1101, x = var_1100)[name = tensor<string, []>("op_1102")];
+            tensor<fp32, [6, 4, 2, 64]> cross_9 = mul(x = var_1099, y = var_1102)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [6, 4, 2, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1092 = const()[name = tensor<string, []>("op_1092"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1093 = reshape(shape = var_1092, x = valid_mask)[name = tensor<string, []>("op_1093")];
-            tensor<fp32, [6, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1093)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1095 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1095")];
-            tensor<bool, []> var_1097_transpose_x_1 = const()[name = tensor<string, []>("op_1097_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1097_transpose_y_1 = const()[name = tensor<string, []>("op_1097_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1097 = matmul(transpose_x = var_1097_transpose_x_1, transpose_y = var_1097_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1097")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1095, y = var_1097)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1099_keep_dims_0 = const()[name = tensor<string, []>("op_1099_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1099 = reduce_sum(keep_dims = var_1099_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1099")];
-            tensor<int32, [1]> var_1100 = const()[name = tensor<string, []>("op_1100"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1101 = reshape(shape = var_1100, x = var_1099)[name = tensor<string, []>("op_1101")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1101)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1105 = const()[name = tensor<string, []>("op_1105"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1106 = reshape(shape = var_1105, x = valid_mask)[name = tensor<string, []>("op_1106")];
+            tensor<fp32, [6, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1106)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [6, 4, 64, 64]> var_1108 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1108")];
+            tensor<bool, []> var_1110_transpose_x_1 = const()[name = tensor<string, []>("op_1110_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1110_transpose_y_1 = const()[name = tensor<string, []>("op_1110_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1110 = matmul(transpose_x = var_1110_transpose_x_1, transpose_y = var_1110_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1110")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1108, y = var_1110)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1112_keep_dims_0 = const()[name = tensor<string, []>("op_1112_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1112 = reduce_sum(keep_dims = var_1112_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1112")];
+            tensor<int32, [1]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1114 = reshape(shape = var_1113, x = var_1112)[name = tensor<string, []>("op_1114")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1114)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_958, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_38, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1105 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1105")];
-            tensor<int32, [4]> var_1106_perm_0 = const()[name = tensor<string, []>("op_1106_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [6, 4, 64, 64]> var_1118 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1118")];
+            tensor<int32, [4]> var_1119_perm_0 = const()[name = tensor<string, []>("op_1119_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 2, 4, 64]> var_1106 = transpose(perm = var_1106_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [6, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_955, x = var_1106)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [6, 2, 256]> out_29 = reshape(shape = var_1110, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [6, 2, 256]> var_1112 = silu(x = input_169)[name = tensor<string, []>("op_1112")];
-            tensor<fp32, [6, 2, 256]> input_171 = mul(x = var_1112, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [6, 2, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [6, 2, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 2, 4, 64]> var_1119 = transpose(perm = var_1119_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [6, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_53, x = var_1119)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [6, 2, 256]> out_29 = reshape(shape = var_1123, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [6, 2, 256]> var_1125 = silu(x = input_171)[name = tensor<string, []>("op_1125")];
+            tensor<fp32, [6, 2, 256]> input_173 = mul(x = var_1125, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 2, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [6, 2, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_953, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1122 = const()[name = tensor<string, []>("op_1122"), val = tensor<int32, [4]>([1, 6, 2, 256])];
-            tensor<fp32, [1, 6, 2, 256]> var_1123 = reshape(shape = var_1122, x = xt_1)[name = tensor<string, []>("op_1123")];
-            tensor<int32, [4]> var_1124_perm_0 = const()[name = tensor<string, []>("op_1124_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1127 = const()[name = tensor<string, []>("op_1127"), val = tensor<int32, [3]>([2, 6, 256])];
-            tensor<fp32, [1, 2, 6, 256]> var_1124 = transpose(perm = var_1124_perm_0, x = var_1123)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [2, 6, 256]> query_1 = reshape(shape = var_1127, x = var_1124)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [6, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_45, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1135 = const()[name = tensor<string, []>("op_1135"), val = tensor<int32, [4]>([1, 6, 2, 256])];
+            tensor<fp32, [1, 6, 2, 256]> var_1136 = reshape(shape = var_1135, x = xt_1)[name = tensor<string, []>("op_1136")];
+            tensor<int32, [4]> var_1137_perm_0 = const()[name = tensor<string, []>("op_1137_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1140 = const()[name = tensor<string, []>("op_1140"), val = tensor<int32, [3]>([2, 6, 256])];
+            tensor<fp32, [1, 2, 6, 256]> var_1137 = transpose(perm = var_1137_perm_0, x = var_1136)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [2, 6, 256]> query_1 = reshape(shape = var_1140, x = var_1137)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 2, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [6, 2, 768]> var_1150 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [6, 2, 768]> var_1163 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([6, 2, 3, 256])];
-            tensor<fp32, [6, 2, 3, 256]> var_1152 = reshape(shape = concat_1, x = var_1150)[name = tensor<string, []>("op_1152")];
-            tensor<int32, [1]> var_1153_axes_0 = const()[name = tensor<string, []>("op_1153_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 2, 3, 256]> var_1153 = expand_dims(axes = var_1153_axes_0, x = var_1152)[name = tensor<string, []>("op_1153")];
-            tensor<int32, [5]> var_1154_perm_0 = const()[name = tensor<string, []>("op_1154_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1155_axes_0 = const()[name = tensor<string, []>("op_1155_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 2, 1, 256]> var_1154 = transpose(perm = var_1154_perm_0, x = var_1153)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 6, 2, 256]> var_1155 = squeeze(axes = var_1155_axes_0, x = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<fp32, [6, 2, 3, 256]> var_1165 = reshape(shape = concat_1, x = var_1163)[name = tensor<string, []>("op_1165")];
+            tensor<int32, [1]> var_1166_axes_0 = const()[name = tensor<string, []>("op_1166_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 2, 3, 256]> var_1166 = expand_dims(axes = var_1166_axes_0, x = var_1165)[name = tensor<string, []>("op_1166")];
+            tensor<int32, [5]> var_1167_perm_0 = const()[name = tensor<string, []>("op_1167_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = tensor<string, []>("op_1168_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 2, 1, 256]> var_1167 = transpose(perm = var_1167_perm_0, x = var_1166)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 6, 2, 256]> var_1168 = squeeze(axes = var_1168_axes_0, x = var_1167)[name = tensor<string, []>("op_1168")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 6, 2, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [6, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 6, 2, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [6, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 6, 2, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1163 = const()[name = tensor<string, []>("op_1163"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1164 = reshape(shape = var_1163, x = q_11)[name = tensor<string, []>("op_1164")];
+            tensor<fp32, [6, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1176 = const()[name = tensor<string, []>("op_1176"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1177 = reshape(shape = var_1176, x = q_11)[name = tensor<string, []>("op_1177")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1170 = const()[name = tensor<string, []>("op_1170"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1171 = reshape(shape = var_1170, x = k_11)[name = tensor<string, []>("op_1171")];
+            tensor<int32, [3]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1184 = reshape(shape = var_1183, x = k_11)[name = tensor<string, []>("op_1184")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1178 = reshape(shape = var_1177, x = v_11)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [3]> var_1190 = const()[name = tensor<string, []>("op_1190"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1191 = reshape(shape = var_1190, x = v_11)[name = tensor<string, []>("op_1191")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1181 = const()[name = tensor<string, []>("op_1181"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1164)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [2, 4, 6, 64]> q_15 = reshape(shape = var_1181, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1171)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [2, 4, 6, 64]> k_15 = reshape(shape = var_1183, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1178)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [2, 4, 6, 64]> v_15 = reshape(shape = var_1185, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1177)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [2, 4, 6, 64]> q_15 = reshape(shape = var_1194, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1196 = const()[name = tensor<string, []>("op_1196"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1184)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [2, 4, 6, 64]> k_15 = reshape(shape = var_1196, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1198 = const()[name = tensor<string, []>("op_1198"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1191)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [2, 4, 6, 64]> v_15 = reshape(shape = var_1198, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 6, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1039,30 +1046,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 6, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1193 = const()[name = tensor<string, []>("op_1193"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [6, 2, 4, 64]> var_1189 = transpose(perm = var_1188, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1193, x = var_1189)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [6, 2, 256]> attn_output_7 = reshape(shape = var_1197, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1201 = const()[name = tensor<string, []>("op_1201"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [6, 2, 4, 64]> var_1202 = transpose(perm = var_1201, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1206, x = var_1202)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1210 = const()[name = tensor<string, []>("op_1210"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [6, 2, 256]> attn_output_7 = reshape(shape = var_1210, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 6, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [2, 6, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 6, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_953, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [2, 6, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [2, 6, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [2, 6, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [2, 6, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 6, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 6, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_45, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [2, 6, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [2, 6, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 6, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [2, 6, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_953, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([1, 2, 6, 256])];
-            tensor<fp32, [1, 2, 6, 256]> x_31 = reshape(shape = var_1217, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1219_perm_0 = const()[name = tensor<string, []>("op_1219_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1223 = const()[name = tensor<string, []>("op_1223"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [1, 6, 2, 256]> var_1219 = transpose(perm = var_1219_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [6, 2, 256]> x = reshape(shape = var_1223, x = var_1219)[name = tensor<string, []>("x")];
+            tensor<fp32, [2, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_45, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [4]>([1, 2, 6, 256])];
+            tensor<fp32, [1, 2, 6, 256]> x_31 = reshape(shape = var_1230, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1232_perm_0 = const()[name = tensor<string, []>("op_1232_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [1, 6, 2, 256]> var_1232 = transpose(perm = var_1232_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [6, 2, 256]> x = reshape(shape = var_1236, x = var_1232)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1073,120 +1080,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [6, 2, 256]> var_1231 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1233 = reshape(shape = var_1232, x = var_1231)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [6, 2, 256]> var_1244 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1246 = reshape(shape = var_1245, x = var_1244)[name = tensor<string, []>("op_1246")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> var_1237 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 2, 256]> var_1239 = mul(x = var_1237, y = var_1238)[name = tensor<string, []>("op_1239")];
-            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [6, 2, 256]> var_1250 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1251 = const()[name = tensor<string, []>("op_1251"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 2, 256]> var_1252 = mul(x = var_1250, y = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<int32, [4]> var_1253 = const()[name = tensor<string, []>("op_1253"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1254 = reshape(shape = var_1253, x = var_1252)[name = tensor<string, []>("op_1254")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> var_1245 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1246 = const()[name = tensor<string, []>("op_1246"), val = tensor<int32, [4]>([6, 2, 4, 64])];
-            tensor<fp32, [6, 2, 4, 64]> var_1247 = reshape(shape = var_1246, x = var_1245)[name = tensor<string, []>("op_1247")];
+            tensor<fp32, [6, 2, 256]> var_1258 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [4]>([6, 2, 4, 64])];
+            tensor<fp32, [6, 2, 4, 64]> var_1260 = reshape(shape = var_1259, x = var_1258)[name = tensor<string, []>("op_1260")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 2, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [6, 2, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [2]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_3 = clip(alpha = var_958, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [2]> clip_3 = clip(alpha = var_38, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [2]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1241)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [6, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1233)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [6, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1254)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [6, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1246)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [6, 4, 2, 2]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1263 = reshape(shape = var_1262, x = sqrt_s_t)[name = tensor<string, []>("op_1263")];
-            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1263)[name = tensor<string, []>("M")];
-            tensor<fp32, [6, 4, 2, 2]> var_1265 = mul(x = qk, y = M)[name = tensor<string, []>("op_1265")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1247)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [6, 4, 2, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1265, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1267_transpose_x_0 = const()[name = tensor<string, []>("op_1267_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1267_transpose_y_0 = const()[name = tensor<string, []>("op_1267_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 2, 64]> var_1267 = matmul(transpose_x = var_1267_transpose_x_0, transpose_y = var_1267_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1267")];
-            tensor<fp32, [2]> var_1268 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1268")];
-            tensor<int32, [4]> var_1269 = const()[name = tensor<string, []>("op_1269"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1270 = reshape(shape = var_1269, x = var_1268)[name = tensor<string, []>("op_1270")];
-            tensor<fp32, [6, 4, 2, 64]> cross = mul(x = var_1267, y = var_1270)[name = tensor<string, []>("cross")];
-            tensor<fp32, [6, 4, 2, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [6, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1093)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [6, 4, 64, 64]> var_1276 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1276")];
-            tensor<bool, []> var_1278_transpose_x_1 = const()[name = tensor<string, []>("op_1278_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1278_transpose_y_1 = const()[name = tensor<string, []>("op_1278_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1278 = matmul(transpose_x = var_1278_transpose_x_1, transpose_y = var_1278_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1278")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1276, y = var_1278)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1101)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1275 = const()[name = tensor<string, []>("op_1275"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1276 = reshape(shape = var_1275, x = sqrt_s_t)[name = tensor<string, []>("op_1276")];
+            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1276)[name = tensor<string, []>("M")];
+            tensor<fp32, [6, 4, 2, 2]> var_1278 = mul(x = qk, y = M)[name = tensor<string, []>("op_1278")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1260)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [6, 4, 2, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1278, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1280_transpose_x_0 = const()[name = tensor<string, []>("op_1280_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1280_transpose_y_0 = const()[name = tensor<string, []>("op_1280_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 2, 64]> var_1280 = matmul(transpose_x = var_1280_transpose_x_0, transpose_y = var_1280_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1280")];
+            tensor<fp32, [2]> var_1281 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1281")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1283 = reshape(shape = var_1282, x = var_1281)[name = tensor<string, []>("op_1283")];
+            tensor<fp32, [6, 4, 2, 64]> cross = mul(x = var_1280, y = var_1283)[name = tensor<string, []>("cross")];
+            tensor<fp32, [6, 4, 2, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [6, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1106)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [6, 4, 64, 64]> var_1289 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1289")];
+            tensor<bool, []> var_1291_transpose_x_1 = const()[name = tensor<string, []>("op_1291_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1291_transpose_y_1 = const()[name = tensor<string, []>("op_1291_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1291 = matmul(transpose_x = var_1291_transpose_x_1, transpose_y = var_1291_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1291")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1289, y = var_1291)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1114)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_958, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_38, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [6, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1287_perm_0 = const()[name = tensor<string, []>("op_1287_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1300_perm_0 = const()[name = tensor<string, []>("op_1300_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 2, 4, 64]> var_1287 = transpose(perm = var_1287_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [6, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_955, x = var_1287)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1291 = const()[name = tensor<string, []>("op_1291"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [6, 2, 256]> out = reshape(shape = var_1291, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [6, 2, 256]> var_1293 = silu(x = input_187)[name = tensor<string, []>("op_1293")];
-            tensor<fp32, [6, 2, 256]> input_189 = mul(x = var_1293, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [6, 2, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [6, 2, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 2, 4, 64]> var_1300 = transpose(perm = var_1300_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [6, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_53, x = var_1300)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [6, 2, 256]> out = reshape(shape = var_1304, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [6, 2, 256]> var_1306 = silu(x = input_189)[name = tensor<string, []>("op_1306")];
+            tensor<fp32, [6, 2, 256]> input_191 = mul(x = var_1306, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 2, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [6, 2, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_953, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [4]>([1, 6, 2, 256])];
-            tensor<fp32, [1, 6, 2, 256]> var_1304 = reshape(shape = var_1303, x = xt_5)[name = tensor<string, []>("op_1304")];
-            tensor<int32, [4]> var_1305_perm_0 = const()[name = tensor<string, []>("op_1305_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1308 = const()[name = tensor<string, []>("op_1308"), val = tensor<int32, [3]>([2, 6, 256])];
-            tensor<fp32, [1, 2, 6, 256]> var_1305 = transpose(perm = var_1305_perm_0, x = var_1304)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [2, 6, 256]> query_5 = reshape(shape = var_1308, x = var_1305)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [6, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_45, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1316 = const()[name = tensor<string, []>("op_1316"), val = tensor<int32, [4]>([1, 6, 2, 256])];
+            tensor<fp32, [1, 6, 2, 256]> var_1317 = reshape(shape = var_1316, x = xt_5)[name = tensor<string, []>("op_1317")];
+            tensor<int32, [4]> var_1318_perm_0 = const()[name = tensor<string, []>("op_1318_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [3]>([2, 6, 256])];
+            tensor<fp32, [1, 2, 6, 256]> var_1318 = transpose(perm = var_1318_perm_0, x = var_1317)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [2, 6, 256]> query_5 = reshape(shape = var_1321, x = var_1318)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 2, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [6, 2, 768]> var_1331 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [6, 2, 768]> var_1344 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([6, 2, 3, 256])];
-            tensor<fp32, [6, 2, 3, 256]> var_1333 = reshape(shape = concat_2, x = var_1331)[name = tensor<string, []>("op_1333")];
-            tensor<int32, [1]> var_1334_axes_0 = const()[name = tensor<string, []>("op_1334_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 2, 3, 256]> var_1334 = expand_dims(axes = var_1334_axes_0, x = var_1333)[name = tensor<string, []>("op_1334")];
-            tensor<int32, [5]> var_1335_perm_0 = const()[name = tensor<string, []>("op_1335_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1336_axes_0 = const()[name = tensor<string, []>("op_1336_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 2, 1, 256]> var_1335 = transpose(perm = var_1335_perm_0, x = var_1334)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 6, 2, 256]> var_1336 = squeeze(axes = var_1336_axes_0, x = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<fp32, [6, 2, 3, 256]> var_1346 = reshape(shape = concat_2, x = var_1344)[name = tensor<string, []>("op_1346")];
+            tensor<int32, [1]> var_1347_axes_0 = const()[name = tensor<string, []>("op_1347_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 2, 3, 256]> var_1347 = expand_dims(axes = var_1347_axes_0, x = var_1346)[name = tensor<string, []>("op_1347")];
+            tensor<int32, [5]> var_1348_perm_0 = const()[name = tensor<string, []>("op_1348_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1349_axes_0 = const()[name = tensor<string, []>("op_1349_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 2, 1, 256]> var_1348 = transpose(perm = var_1348_perm_0, x = var_1347)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 6, 2, 256]> var_1349 = squeeze(axes = var_1349_axes_0, x = var_1348)[name = tensor<string, []>("op_1349")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 6, 2, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [6, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 6, 2, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [6, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 6, 2, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1344 = const()[name = tensor<string, []>("op_1344"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1345 = reshape(shape = var_1344, x = q_19)[name = tensor<string, []>("op_1345")];
+            tensor<fp32, [6, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1358 = reshape(shape = var_1357, x = q_19)[name = tensor<string, []>("op_1358")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1351 = const()[name = tensor<string, []>("op_1351"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1352 = reshape(shape = var_1351, x = k_19)[name = tensor<string, []>("op_1352")];
+            tensor<int32, [3]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1365 = reshape(shape = var_1364, x = k_19)[name = tensor<string, []>("op_1365")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [3]>([6, 8, 64])];
-            tensor<fp32, [6, 8, 64]> var_1359 = reshape(shape = var_1358, x = v_19)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [3]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [3]>([6, 8, 64])];
+            tensor<fp32, [6, 8, 64]> var_1372 = reshape(shape = var_1371, x = v_19)[name = tensor<string, []>("op_1372")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1362 = const()[name = tensor<string, []>("op_1362"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1345)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [2, 4, 6, 64]> q = reshape(shape = var_1362, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1352)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [2, 4, 6, 64]> k = reshape(shape = var_1364, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([2, 4, 6, 64])];
-            tensor<fp32, [8, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1359)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [2, 4, 6, 64]> v = reshape(shape = var_1366, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1375 = const()[name = tensor<string, []>("op_1375"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1358)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [2, 4, 6, 64]> q = reshape(shape = var_1375, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1377 = const()[name = tensor<string, []>("op_1377"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1365)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [2, 4, 6, 64]> k = reshape(shape = var_1377, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1379 = const()[name = tensor<string, []>("op_1379"), val = tensor<int32, [4]>([2, 4, 6, 64])];
+            tensor<fp32, [8, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1372)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [2, 4, 6, 64]> v = reshape(shape = var_1379, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 6, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1197,36 +1204,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 6, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1374 = const()[name = tensor<string, []>("op_1374"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [6, 2, 4, 64]> var_1370 = transpose(perm = var_1369, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1374, x = var_1370)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<int32, [3]>([6, 2, 256])];
-            tensor<fp32, [6, 2, 256]> attn_output = reshape(shape = var_1378, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1382 = const()[name = tensor<string, []>("op_1382"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1387 = const()[name = tensor<string, []>("op_1387"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [6, 2, 4, 64]> var_1383 = transpose(perm = var_1382, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1387, x = var_1383)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1391 = const()[name = tensor<string, []>("op_1391"), val = tensor<int32, [3]>([6, 2, 256])];
+            tensor<fp32, [6, 2, 256]> attn_output = reshape(shape = var_1391, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 6, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [2, 6, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 6, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_953, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [2, 6, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [2, 6, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [2, 6, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [2, 6, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 6, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 6, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_45, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [2, 6, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [2, 6, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 6, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [2, 6, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_953, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([1, 2, 6, 256])];
-            tensor<fp32, [1, 2, 6, 256]> input = reshape(shape = var_1398, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 6, 1]> var_1401 = reduce_l2_norm(axes = var_1400, keep_dims = var_956, x = input)[name = tensor<string, []>("op_1401")];
+            tensor<fp32, [2, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_45, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1411 = const()[name = tensor<string, []>("op_1411"), val = tensor<int32, [4]>([1, 2, 6, 256])];
+            tensor<fp32, [1, 2, 6, 256]> input = reshape(shape = var_1411, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 6, 1]> var_1414 = reduce_l2_norm(axes = var_1413, keep_dims = var_44, x = input)[name = tensor<string, []>("op_1414")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 6, 1]> clip_5 = clip(alpha = var_948, beta = const_42, x = var_1401)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 2, 6, 256]> var_1403 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [1, 2, 6, 1]> clip_5 = clip(alpha = var_58, beta = const_42, x = var_1414)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 2, 6, 256]> var_1416 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1416")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([2, 1, 256])];
             tensor<fp32, [2, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([2, 256, 6])];
-            tensor<fp32, [1, 2, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1403)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 2, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1416)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [2, 256, 6]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1237,10 +1244,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 2, 5])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 2, 4]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 2, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1407")];
-            tensor<int32, []> var_1409_axis_0 = const()[name = tensor<string, []>("op_1409_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1409_axis_0, values = (var_1105, nkv))[name = tensor<string, []>("op_1409")];
-            tensor<int32, []> var_1411_axis_0 = const()[name = tensor<string, []>("op_1411_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1411_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1411")];
+            tensor<fp32, [1, 2, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1420")];
+            tensor<int32, []> var_1422_axis_0 = const()[name = tensor<string, []>("op_1422_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1422_axis_0, values = (var_1118, nkv))[name = tensor<string, []>("op_1422")];
+            tensor<int32, []> var_1424_axis_0 = const()[name = tensor<string, []>("op_1424_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1424_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1424")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 4c678f86789d8217225a774ec066ff29f1c6adcc..e7ff7887f06048798363799e4ecb439139f1e774 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:286832ef1477a85722d0e808565e32c2edf2c7abbbb9f2b3447253ac30f39155
-size 179858
+oid sha256:7a616be3cfa5c13828e61ddda409e43a20eb8f48c2893530948c7d073a6c87b3
+size 184838
diff --git a/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Manifest.json b/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Manifest.json
index 612bcaf8e0b488f282381fd935e727601819bea6..579432c763882788ec14fcf33d6cef3c9fad713b 100644
--- a/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Manifest.json
+++ b/optimized/ami/200ms/ls_eend_ami_200ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "9AA855BB-15E6-4440-985F-9D01D6A86BD7": {
+        "1F7F287A-3524-40B3-B470-5E617233CC0A": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "FE3BD696-8DA2-4220-BA67-6DFD3C1F41A7": {
+        "D525D189-4337-40E9-852C-1E328106320D": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "FE3BD696-8DA2-4220-BA67-6DFD3C1F41A7"
+    "rootModelIdentifier": "D525D189-4337-40E9-852C-1E328106320D"
 }
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/analytics/coremldata.bin b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/analytics/coremldata.bin
index 3f6639b57a269384ee54f9f1660607c52c504864..a13d866bed0a243a078e8be8aba58b6889b0eb4e 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6f56029ce7301e97b0cefc55ff6f7d7d8a47604b531cac5a18717b8934c6c12
+oid sha256:1440d924c2808e18a0ec1b1d62ad4cd68275891ba9c6688a69241a12558f5ae5
 size 243
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/coremldata.bin b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/coremldata.bin
index 0adcb3383c298e39175a387aa8c5ed93b77e17d8..0b6e249fee2052fd2894882289efbf14210186c6 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/coremldata.bin
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e5272855c6f2aed3afbb9d4479de3da636f7780e628ff7ad02dc5bbc5f5d7894
-size 1292
+oid sha256:44f6bb31efc91684a91067fdc08db6715c7bcac6d9358cca9e7ff435eaa3e4a1
+size 1395
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/metadata.json b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/metadata.json
index 24752739dc034358ad51ac236bb8ea00423b379e..c0b4784fccc4747491b550cb4a84e30f4973d127 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/metadata.json
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=3, max_speakers=4)",
+    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=3, max_speakers=4, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 56,
+      "Ios17.sliceByIndex" : 59,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 18,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 3 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 35 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 3, 345]",
+        "shape" : "[1, 35, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 35}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/model.mil b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/model.mil
index 970596a9f4827af95d9232b6600321f0fda4a1c0..0b1302f5a5fd1e848cfab7ab40d920e3e8992726 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/model.mil
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlmodelc/model.mil
@@ -1,234 +1,252 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 3, 345]> features, tensor<fp32, [3]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [3, 3]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [3]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [3, 3]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 3, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 35, 23]> features, tensor<fp32, [3]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [3, 3]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [3]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [3, 3]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<int32, [3]>([1, 3, 345])];
+            tensor<fp32, [1, 3, 345]> input_1 = reshape(shape = var_46, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_48 = const()[name = tensor<string, []>("op_48"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_58 = const()[name = tensor<string, []>("op_58"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_60 = const()[name = tensor<string, []>("op_60"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_61 = const()[name = tensor<string, []>("op_61"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_63 = const()[name = tensor<string, []>("op_63"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_68 = const()[name = tensor<string, []>("op_68"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 3, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 3, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 3, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_55, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 3, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 3, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_193 = const()[name = tensor<string, []>("op_193"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_194 = mul(x = input_13, y = var_193)[name = tensor<string, []>("op_194")];
+            tensor<fp32, [1, 3, 256]> input_15 = add(x = var_194, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_55, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,163 +257,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 3, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 3, 256]> var_208 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_209 = const()[name = tensor<string, []>("op_209"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_210 = reshape(shape = var_209, x = var_208)[name = tensor<string, []>("op_210")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 3, 256]> var_214 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_215 = const()[name = tensor<string, []>("op_215"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_216 = mul(x = var_214, y = var_215)[name = tensor<string, []>("op_216")];
+            tensor<int32, [4]> var_217 = const()[name = tensor<string, []>("op_217"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_218 = reshape(shape = var_217, x = var_216)[name = tensor<string, []>("op_218")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 3, 256]> var_222 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_223 = const()[name = tensor<string, []>("op_223"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_224 = reshape(shape = var_223, x = var_222)[name = tensor<string, []>("op_224")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 3, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [3]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_218)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_210)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 3, 3]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [3, 3]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 3, 3]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_234 = const()[name = tensor<string, []>("op_234"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_235 = reshape(shape = var_234, x = sqrt_s_t_1)[name = tensor<string, []>("op_235")];
+            tensor<fp32, [3, 3]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_235)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 3, 3]> var_237 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_237")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [3]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_224)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_237, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_239_transpose_x_0 = const()[name = tensor<string, []>("op_239_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_239_transpose_y_0 = const()[name = tensor<string, []>("op_239_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_239 = matmul(transpose_x = var_239_transpose_x_0, transpose_y = var_239_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [3]> var_240 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_240")];
+            tensor<int32, [4]> var_241 = const()[name = tensor<string, []>("op_241"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_242 = reshape(shape = var_241, x = var_240)[name = tensor<string, []>("op_242")];
+            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_239, y = var_242)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 3, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_245 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_245")];
+            tensor<bool, []> var_247_transpose_x_1 = const()[name = tensor<string, []>("op_247_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_247_transpose_y_1 = const()[name = tensor<string, []>("op_247_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_247 = matmul(transpose_x = var_247_transpose_x_1, transpose_y = var_247_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_247")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_245, y = var_247)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_249 = const()[name = tensor<string, []>("op_249"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_249)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_251 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_251")];
+            tensor<fp32, [1, 4, 64, 64]> var_252 = real_div(x = new_kv_unnorm_1, y = var_251)[name = tensor<string, []>("op_252")];
+            tensor<int32, [4]> var_253_perm_0 = const()[name = tensor<string, []>("op_253_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 3, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 3, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 3, 4, 64]> var_253 = transpose(perm = var_253_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_63, x = var_253)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_257 = const()[name = tensor<string, []>("op_257"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_257, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 3, 256]> var_259 = silu(x = input_19)[name = tensor<string, []>("op_259")];
+            tensor<fp32, [1, 3, 256]> input_21 = mul(x = var_259, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_267 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = x_3)[name = tensor<string, []>("op_267")];
+            tensor<int32, [3]> var_270_begin_0 = const()[name = tensor<string, []>("op_270_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_270_end_0 = const()[name = tensor<string, []>("op_270_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_270_end_mask_0 = const()[name = tensor<string, []>("op_270_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_270 = slice_by_index(begin = var_270_begin_0, end = var_270_end_0, end_mask = var_270_end_mask_0, x = window_1)[name = tensor<string, []>("op_270")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_72, interleave = window_3_interleave_0, values = (var_270, var_267))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_275 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = x_3)[name = tensor<string, []>("op_275")];
+            tensor<int32, [3]> var_278_begin_0 = const()[name = tensor<string, []>("op_278_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_278_end_0 = const()[name = tensor<string, []>("op_278_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_278_end_mask_0 = const()[name = tensor<string, []>("op_278_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_278 = slice_by_index(begin = var_278_begin_0, end = var_278_end_0, end_mask = var_278_end_mask_0, x = window_3)[name = tensor<string, []>("op_278")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_72, interleave = window_5_interleave_0, values = (var_278, var_275))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_283 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = x_3)[name = tensor<string, []>("op_283")];
+            tensor<int32, [3]> var_286_begin_0 = const()[name = tensor<string, []>("op_286_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_286_end_0 = const()[name = tensor<string, []>("op_286_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_286_end_mask_0 = const()[name = tensor<string, []>("op_286_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_286 = slice_by_index(begin = var_286_begin_0, end = var_286_end_0, end_mask = var_286_end_mask_0, x = window_5)[name = tensor<string, []>("op_286")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_72, interleave = window_7_interleave_0, values = (var_286, var_283))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_23 = concat(axis = var_58, interleave = input_23_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_265_split_sizes_0 = const()[name = tensor<string, []>("op_265_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_265_axis_0 = const()[name = tensor<string, []>("op_265_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_265_0, tensor<fp32, [3, 256, 16]> var_265_1 = split(axis = var_265_axis_0, split_sizes = var_265_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_265")];
-            tensor<fp32, [3, 256, 16]> var_267 = sigmoid(x = var_265_1)[name = tensor<string, []>("op_267")];
-            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_265_0, y = var_267)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [3, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_311_split_sizes_0 = const()[name = tensor<string, []>("op_311_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_311_axis_0 = const()[name = tensor<string, []>("op_311_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_311_0, tensor<fp32, [3, 256, 16]> var_311_1 = split(axis = var_311_axis_0, split_sizes = var_311_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_311")];
+            tensor<fp32, [3, 256, 16]> var_313 = sigmoid(x = var_311_1)[name = tensor<string, []>("op_313")];
+            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_311_0, y = var_313)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [3, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_55, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [3, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [3, 1, 256]> var_298 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_298")];
-            tensor<int32, [3]> var_300_perm_0 = const()[name = tensor<string, []>("op_300_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_300 = transpose(perm = var_300_perm_0, x = var_298)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 3, 256]> input_31 = add(x = x_3, y = var_300)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 3, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 3, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_323 = const()[name = tensor<string, []>("op_323"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_324 = mul(x = input_39, y = var_323)[name = tensor<string, []>("op_324")];
-            tensor<fp32, [1, 3, 256]> input_41 = add(x = var_324, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_344_begin_0 = const()[name = tensor<string, []>("op_344_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_344_end_0 = const()[name = tensor<string, []>("op_344_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_344_end_mask_0 = const()[name = tensor<string, []>("op_344_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [3, 1, 256]> var_344 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_344")];
+            tensor<int32, [3]> var_346_perm_0 = const()[name = tensor<string, []>("op_346_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_346 = transpose(perm = var_346_perm_0, x = var_344)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 3, 256]> input_33 = add(x = x_3, y = var_346)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 3, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 3, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_370 = mul(x = input_41, y = var_369)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> input_43 = add(x = var_370, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 3, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 3, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_354 = mul(x = input_51, y = var_353)[name = tensor<string, []>("op_354")];
-            tensor<fp32, [1, 3, 256]> input_53 = add(x = var_354, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_55, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 3, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 3, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_399 = const()[name = tensor<string, []>("op_399"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_400 = mul(x = input_53, y = var_399)[name = tensor<string, []>("op_400")];
+            tensor<fp32, [1, 3, 256]> input_55 = add(x = var_400, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_55, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -406,163 +424,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 3, 256]> var_368 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> var_414 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_415 = const()[name = tensor<string, []>("op_415"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_416 = reshape(shape = var_415, x = var_414)[name = tensor<string, []>("op_416")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_374 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_376 = mul(x = var_374, y = var_375)[name = tensor<string, []>("op_376")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 3, 256]> var_420 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_421 = const()[name = tensor<string, []>("op_421"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_422 = mul(x = var_420, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423 = const()[name = tensor<string, []>("op_423"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_424 = reshape(shape = var_423, x = var_422)[name = tensor<string, []>("op_424")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_382 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_384 = reshape(shape = var_383, x = var_382)[name = tensor<string, []>("op_384")];
+            tensor<fp32, [1, 3, 256]> var_428 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 3, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [3]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_424)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_416)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 3, 3]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_395 = reshape(shape = var_394, x = sqrt_s_t_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [3, 3]> M_3 = real_div(x = encoder__causal_mask, y = var_395)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 3, 3]> var_397 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_397")];
+            tensor<int32, [2]> var_440 = const()[name = tensor<string, []>("op_440"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_441 = reshape(shape = var_440, x = sqrt_s_t_3)[name = tensor<string, []>("op_441")];
+            tensor<fp32, [3, 3]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_441)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 3, 3]> var_443 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_443")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_384)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_397, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_399_transpose_x_0 = const()[name = tensor<string, []>("op_399_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_399_transpose_y_0 = const()[name = tensor<string, []>("op_399_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_0, transpose_y = var_399_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [3]> var_400 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_400")];
-            tensor<int32, [4]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_402 = reshape(shape = var_401, x = var_400)[name = tensor<string, []>("op_402")];
-            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_399, y = var_402)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_430)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_443, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_445_transpose_x_0 = const()[name = tensor<string, []>("op_445_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_445_transpose_y_0 = const()[name = tensor<string, []>("op_445_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_445 = matmul(transpose_x = var_445_transpose_x_0, transpose_y = var_445_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_445")];
+            tensor<fp32, [3]> var_446 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_446")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
+            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_445, y = var_448)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 3, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_405 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_405")];
-            tensor<bool, []> var_407_transpose_x_1 = const()[name = tensor<string, []>("op_407_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_407_transpose_y_1 = const()[name = tensor<string, []>("op_407_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_1, transpose_y = var_407_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_405, y = var_407)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_409)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_411 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 4, 64, 64]> var_412 = real_div(x = new_kv_unnorm_3, y = var_411)[name = tensor<string, []>("op_412")];
-            tensor<int32, [4]> var_413_perm_0 = const()[name = tensor<string, []>("op_413_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_451 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_451")];
+            tensor<bool, []> var_453_transpose_x_1 = const()[name = tensor<string, []>("op_453_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_453_transpose_y_1 = const()[name = tensor<string, []>("op_453_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_453 = matmul(transpose_x = var_453_transpose_x_1, transpose_y = var_453_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_453")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_451, y = var_453)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_455 = const()[name = tensor<string, []>("op_455"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_455)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_457 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_457")];
+            tensor<fp32, [1, 4, 64, 64]> var_458 = real_div(x = new_kv_unnorm_3, y = var_457)[name = tensor<string, []>("op_458")];
+            tensor<int32, [4]> var_459_perm_0 = const()[name = tensor<string, []>("op_459_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_413 = transpose(perm = var_413_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_413)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_417, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 3, 256]> var_419 = silu(x = input_57)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 3, 256]> input_59 = mul(x = var_419, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 3, 4, 64]> var_459 = transpose(perm = var_459_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_63, x = var_459)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_463 = const()[name = tensor<string, []>("op_463"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_463, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 3, 256]> var_465 = silu(x = input_59)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 3, 256]> input_61 = mul(x = var_465, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<int32, [3]> var_473_begin_0 = const()[name = tensor<string, []>("op_473_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_473_end_0 = const()[name = tensor<string, []>("op_473_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_473_end_mask_0 = const()[name = tensor<string, []>("op_473_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_473 = slice_by_index(begin = var_473_begin_0, end = var_473_end_0, end_mask = var_473_end_mask_0, x = x_9)[name = tensor<string, []>("op_473")];
+            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = window_9)[name = tensor<string, []>("op_476")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_72, interleave = window_11_interleave_0, values = (var_476, var_473))[name = tensor<string, []>("window_11")];
+            tensor<int32, [3]> var_481_begin_0 = const()[name = tensor<string, []>("op_481_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_481_end_0 = const()[name = tensor<string, []>("op_481_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_481_end_mask_0 = const()[name = tensor<string, []>("op_481_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_481 = slice_by_index(begin = var_481_begin_0, end = var_481_end_0, end_mask = var_481_end_mask_0, x = x_9)[name = tensor<string, []>("op_481")];
+            tensor<int32, [3]> var_484_begin_0 = const()[name = tensor<string, []>("op_484_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_484_end_0 = const()[name = tensor<string, []>("op_484_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_484_end_mask_0 = const()[name = tensor<string, []>("op_484_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_484 = slice_by_index(begin = var_484_begin_0, end = var_484_end_0, end_mask = var_484_end_mask_0, x = window_11)[name = tensor<string, []>("op_484")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_72, interleave = window_13_interleave_0, values = (var_484, var_481))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_489_begin_0 = const()[name = tensor<string, []>("op_489_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_489_end_0 = const()[name = tensor<string, []>("op_489_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_489_end_mask_0 = const()[name = tensor<string, []>("op_489_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_489 = slice_by_index(begin = var_489_begin_0, end = var_489_end_0, end_mask = var_489_end_mask_0, x = x_9)[name = tensor<string, []>("op_489")];
+            tensor<int32, [3]> var_492_begin_0 = const()[name = tensor<string, []>("op_492_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_492_end_0 = const()[name = tensor<string, []>("op_492_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_492_end_mask_0 = const()[name = tensor<string, []>("op_492_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_492 = slice_by_index(begin = var_492_begin_0, end = var_492_end_0, end_mask = var_492_end_mask_0, x = window_13)[name = tensor<string, []>("op_492")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_72, interleave = window_15_interleave_0, values = (var_492, var_489))[name = tensor<string, []>("window_15")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_63 = concat(axis = var_58, interleave = input_63_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_471_split_sizes_0 = const()[name = tensor<string, []>("op_471_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_471_axis_0 = const()[name = tensor<string, []>("op_471_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_471_0, tensor<fp32, [3, 256, 16]> var_471_1 = split(axis = var_471_axis_0, split_sizes = var_471_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_471")];
-            tensor<fp32, [3, 256, 16]> var_473 = sigmoid(x = var_471_1)[name = tensor<string, []>("op_473")];
-            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_471_0, y = var_473)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [3, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_517_split_sizes_0 = const()[name = tensor<string, []>("op_517_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_517_axis_0 = const()[name = tensor<string, []>("op_517_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_517_0, tensor<fp32, [3, 256, 16]> var_517_1 = split(axis = var_517_axis_0, split_sizes = var_517_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_517")];
+            tensor<fp32, [3, 256, 16]> var_519 = sigmoid(x = var_517_1)[name = tensor<string, []>("op_519")];
+            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_517_0, y = var_519)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [3, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_55, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [3, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_504_begin_0 = const()[name = tensor<string, []>("op_504_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_504_end_0 = const()[name = tensor<string, []>("op_504_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_504_end_mask_0 = const()[name = tensor<string, []>("op_504_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [3, 1, 256]> var_504 = slice_by_index(begin = var_504_begin_0, end = var_504_end_0, end_mask = var_504_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_504")];
-            tensor<int32, [3]> var_506_perm_0 = const()[name = tensor<string, []>("op_506_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_506 = transpose(perm = var_506_perm_0, x = var_504)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 3, 256]> input_71 = add(x = x_9, y = var_506)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 3, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 3, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_529 = const()[name = tensor<string, []>("op_529"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_530 = mul(x = input_79, y = var_529)[name = tensor<string, []>("op_530")];
-            tensor<fp32, [1, 3, 256]> input_81 = add(x = var_530, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_550_begin_0 = const()[name = tensor<string, []>("op_550_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_550_end_0 = const()[name = tensor<string, []>("op_550_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_550_end_mask_0 = const()[name = tensor<string, []>("op_550_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [3, 1, 256]> var_550 = slice_by_index(begin = var_550_begin_0, end = var_550_end_0, end_mask = var_550_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_550")];
+            tensor<int32, [3]> var_552_perm_0 = const()[name = tensor<string, []>("op_552_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_552 = transpose(perm = var_552_perm_0, x = var_550)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 3, 256]> input_73 = add(x = x_9, y = var_552)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 3, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 3, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_576 = mul(x = input_81, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> input_83 = add(x = var_576, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 3, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 3, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_560 = mul(x = input_91, y = var_559)[name = tensor<string, []>("op_560")];
-            tensor<fp32, [1, 3, 256]> input_93 = add(x = var_560, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_55, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 3, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 3, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_606 = mul(x = input_93, y = var_605)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 3, 256]> input_95 = add(x = var_606, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_55, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -573,163 +591,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 3, 256]> var_574 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> var_620 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_621 = const()[name = tensor<string, []>("op_621"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_622 = reshape(shape = var_621, x = var_620)[name = tensor<string, []>("op_622")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_580 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_581 = const()[name = tensor<string, []>("op_581"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_582 = mul(x = var_580, y = var_581)[name = tensor<string, []>("op_582")];
-            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
+            tensor<fp32, [1, 3, 256]> var_626 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_628 = mul(x = var_626, y = var_627)[name = tensor<string, []>("op_628")];
+            tensor<int32, [4]> var_629 = const()[name = tensor<string, []>("op_629"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_630 = reshape(shape = var_629, x = var_628)[name = tensor<string, []>("op_630")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_588 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_589 = const()[name = tensor<string, []>("op_589"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_590 = reshape(shape = var_589, x = var_588)[name = tensor<string, []>("op_590")];
+            tensor<fp32, [1, 3, 256]> var_634 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_636 = reshape(shape = var_635, x = var_634)[name = tensor<string, []>("op_636")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 3, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [3]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_576)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_630)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_622)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 3, 3]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_600 = const()[name = tensor<string, []>("op_600"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_601 = reshape(shape = var_600, x = sqrt_s_t_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [3, 3]> M_5 = real_div(x = encoder__causal_mask, y = var_601)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 3, 3]> var_603 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_603")];
+            tensor<int32, [2]> var_646 = const()[name = tensor<string, []>("op_646"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_647 = reshape(shape = var_646, x = sqrt_s_t_5)[name = tensor<string, []>("op_647")];
+            tensor<fp32, [3, 3]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_647)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 3, 3]> var_649 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_649")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_590)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_603, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_605_transpose_x_0 = const()[name = tensor<string, []>("op_605_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_605_transpose_y_0 = const()[name = tensor<string, []>("op_605_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_605 = matmul(transpose_x = var_605_transpose_x_0, transpose_y = var_605_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_605")];
-            tensor<fp32, [3]> var_606 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_606")];
-            tensor<int32, [4]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_608 = reshape(shape = var_607, x = var_606)[name = tensor<string, []>("op_608")];
-            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_605, y = var_608)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_636)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_649, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_651_transpose_x_0 = const()[name = tensor<string, []>("op_651_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_651_transpose_y_0 = const()[name = tensor<string, []>("op_651_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_651 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_651")];
+            tensor<fp32, [3]> var_652 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_652")];
+            tensor<int32, [4]> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_654 = reshape(shape = var_653, x = var_652)[name = tensor<string, []>("op_654")];
+            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_651, y = var_654)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 3, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_611 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_611")];
-            tensor<bool, []> var_613_transpose_x_1 = const()[name = tensor<string, []>("op_613_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_613_transpose_y_1 = const()[name = tensor<string, []>("op_613_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_613 = matmul(transpose_x = var_613_transpose_x_1, transpose_y = var_613_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_613")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_611, y = var_613)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_615 = const()[name = tensor<string, []>("op_615"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_615)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_617 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [1, 4, 64, 64]> var_618 = real_div(x = new_kv_unnorm_5, y = var_617)[name = tensor<string, []>("op_618")];
-            tensor<int32, [4]> var_619_perm_0 = const()[name = tensor<string, []>("op_619_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_657 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_657")];
+            tensor<bool, []> var_659_transpose_x_1 = const()[name = tensor<string, []>("op_659_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_659_transpose_y_1 = const()[name = tensor<string, []>("op_659_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_659 = matmul(transpose_x = var_659_transpose_x_1, transpose_y = var_659_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_657, y = var_659)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_661)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_663 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_663")];
+            tensor<fp32, [1, 4, 64, 64]> var_664 = real_div(x = new_kv_unnorm_5, y = var_663)[name = tensor<string, []>("op_664")];
+            tensor<int32, [4]> var_665_perm_0 = const()[name = tensor<string, []>("op_665_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_619 = transpose(perm = var_619_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_619)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_623, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 3, 256]> var_625 = silu(x = input_97)[name = tensor<string, []>("op_625")];
-            tensor<fp32, [1, 3, 256]> input_99 = mul(x = var_625, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 3, 4, 64]> var_665 = transpose(perm = var_665_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_63, x = var_665)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_669 = const()[name = tensor<string, []>("op_669"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_669, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 3, 256]> var_671 = silu(x = input_99)[name = tensor<string, []>("op_671")];
+            tensor<fp32, [1, 3, 256]> input_101 = mul(x = var_671, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_17_begin_0 = const()[name = tensor<string, []>("window_17_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_17_end_0 = const()[name = tensor<string, []>("window_17_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_17_end_mask_0 = const()[name = tensor<string, []>("window_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_17_squeeze_mask_0 = const()[name = tensor<string, []>("window_17_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_17 = slice_by_index(begin = window_17_begin_0, end = window_17_end_0, end_mask = window_17_end_mask_0, squeeze_mask = window_17_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_633_begin_0 = const()[name = tensor<string, []>("op_633_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_633_end_0 = const()[name = tensor<string, []>("op_633_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_633_end_mask_0 = const()[name = tensor<string, []>("op_633_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_633 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = x_15)[name = tensor<string, []>("op_633")];
-            tensor<int32, [3]> var_636_begin_0 = const()[name = tensor<string, []>("op_636_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_636_end_0 = const()[name = tensor<string, []>("op_636_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_636_end_mask_0 = const()[name = tensor<string, []>("op_636_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_636 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = window_17)[name = tensor<string, []>("op_636")];
+            tensor<int32, [3]> var_679_begin_0 = const()[name = tensor<string, []>("op_679_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_679_end_0 = const()[name = tensor<string, []>("op_679_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_679_end_mask_0 = const()[name = tensor<string, []>("op_679_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_679 = slice_by_index(begin = var_679_begin_0, end = var_679_end_0, end_mask = var_679_end_mask_0, x = x_15)[name = tensor<string, []>("op_679")];
+            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = window_17)[name = tensor<string, []>("op_682")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_636, var_633))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_641_begin_0 = const()[name = tensor<string, []>("op_641_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_641_end_0 = const()[name = tensor<string, []>("op_641_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_641_end_mask_0 = const()[name = tensor<string, []>("op_641_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_641 = slice_by_index(begin = var_641_begin_0, end = var_641_end_0, end_mask = var_641_end_mask_0, x = x_15)[name = tensor<string, []>("op_641")];
-            tensor<int32, [3]> var_644_begin_0 = const()[name = tensor<string, []>("op_644_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_644_end_0 = const()[name = tensor<string, []>("op_644_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_644_end_mask_0 = const()[name = tensor<string, []>("op_644_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_644 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = window_19)[name = tensor<string, []>("op_644")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_72, interleave = window_19_interleave_0, values = (var_682, var_679))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_687_begin_0 = const()[name = tensor<string, []>("op_687_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_687_end_0 = const()[name = tensor<string, []>("op_687_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_687_end_mask_0 = const()[name = tensor<string, []>("op_687_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_687 = slice_by_index(begin = var_687_begin_0, end = var_687_end_0, end_mask = var_687_end_mask_0, x = x_15)[name = tensor<string, []>("op_687")];
+            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = window_19)[name = tensor<string, []>("op_690")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_644, var_641))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_72, interleave = window_21_interleave_0, values = (var_690, var_687))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_695_begin_0 = const()[name = tensor<string, []>("op_695_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_695_end_0 = const()[name = tensor<string, []>("op_695_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_695_end_mask_0 = const()[name = tensor<string, []>("op_695_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_695 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = x_15)[name = tensor<string, []>("op_695")];
+            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = window_21)[name = tensor<string, []>("op_698")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_72, interleave = window_23_interleave_0, values = (var_698, var_695))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_103 = concat(axis = var_58, interleave = input_103_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_677_split_sizes_0 = const()[name = tensor<string, []>("op_677_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_677_axis_0 = const()[name = tensor<string, []>("op_677_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_677_0, tensor<fp32, [3, 256, 16]> var_677_1 = split(axis = var_677_axis_0, split_sizes = var_677_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_677")];
-            tensor<fp32, [3, 256, 16]> var_679 = sigmoid(x = var_677_1)[name = tensor<string, []>("op_679")];
-            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_677_0, y = var_679)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [3, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_723_split_sizes_0 = const()[name = tensor<string, []>("op_723_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_723_axis_0 = const()[name = tensor<string, []>("op_723_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_723_0, tensor<fp32, [3, 256, 16]> var_723_1 = split(axis = var_723_axis_0, split_sizes = var_723_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_723")];
+            tensor<fp32, [3, 256, 16]> var_725 = sigmoid(x = var_723_1)[name = tensor<string, []>("op_725")];
+            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_723_0, y = var_725)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [3, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_55, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [3, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_710_begin_0 = const()[name = tensor<string, []>("op_710_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_710_end_0 = const()[name = tensor<string, []>("op_710_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_710_end_mask_0 = const()[name = tensor<string, []>("op_710_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [3, 1, 256]> var_710 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_710")];
-            tensor<int32, [3]> var_712_perm_0 = const()[name = tensor<string, []>("op_712_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_712 = transpose(perm = var_712_perm_0, x = var_710)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 3, 256]> input_111 = add(x = x_15, y = var_712)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 3, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 3, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_735 = const()[name = tensor<string, []>("op_735"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_736 = mul(x = input_119, y = var_735)[name = tensor<string, []>("op_736")];
-            tensor<fp32, [1, 3, 256]> input_121 = add(x = var_736, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [3, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_758_perm_0 = const()[name = tensor<string, []>("op_758_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_758 = transpose(perm = var_758_perm_0, x = var_756)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 3, 256]> input_113 = add(x = x_15, y = var_758)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 3, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 3, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_782 = mul(x = input_121, y = var_781)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> input_123 = add(x = var_782, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 3, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 3, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_766 = mul(x = input_131, y = var_765)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 3, 256]> input_133 = add(x = var_766, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_55, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 3, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 3, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_812 = mul(x = input_133, y = var_811)[name = tensor<string, []>("op_812")];
+            tensor<fp32, [1, 3, 256]> input_135 = add(x = var_812, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_55, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -740,199 +758,192 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 3, 256]> var_780 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_782 = reshape(shape = var_781, x = var_780)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> var_826 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_827 = const()[name = tensor<string, []>("op_827"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_828 = reshape(shape = var_827, x = var_826)[name = tensor<string, []>("op_828")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_786 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_787 = const()[name = tensor<string, []>("op_787"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_788 = mul(x = var_786, y = var_787)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
+            tensor<fp32, [1, 3, 256]> var_832 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_834 = mul(x = var_832, y = var_833)[name = tensor<string, []>("op_834")];
+            tensor<int32, [4]> var_835 = const()[name = tensor<string, []>("op_835"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_836 = reshape(shape = var_835, x = var_834)[name = tensor<string, []>("op_836")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_794 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_795 = const()[name = tensor<string, []>("op_795"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_796 = reshape(shape = var_795, x = var_794)[name = tensor<string, []>("op_796")];
+            tensor<fp32, [1, 3, 256]> var_840 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_842 = reshape(shape = var_841, x = var_840)[name = tensor<string, []>("op_842")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 3, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [3]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_790)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_782)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_836)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_828)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 3, 3]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_806 = const()[name = tensor<string, []>("op_806"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_807 = reshape(shape = var_806, x = sqrt_s_t_7)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [3, 3]> M_7 = real_div(x = encoder__causal_mask, y = var_807)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 3, 3]> var_809 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_809")];
+            tensor<int32, [2]> var_852 = const()[name = tensor<string, []>("op_852"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_853 = reshape(shape = var_852, x = sqrt_s_t_7)[name = tensor<string, []>("op_853")];
+            tensor<fp32, [3, 3]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_853)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 3, 3]> var_855 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_855")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_796)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_809, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_811_transpose_x_0 = const()[name = tensor<string, []>("op_811_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_811_transpose_y_0 = const()[name = tensor<string, []>("op_811_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_811 = matmul(transpose_x = var_811_transpose_x_0, transpose_y = var_811_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_811")];
-            tensor<fp32, [3]> var_812 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
-            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_811, y = var_814)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_842)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_855, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_857_transpose_x_0 = const()[name = tensor<string, []>("op_857_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_857_transpose_y_0 = const()[name = tensor<string, []>("op_857_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_857 = matmul(transpose_x = var_857_transpose_x_0, transpose_y = var_857_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_857")];
+            tensor<fp32, [3]> var_858 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [4]> var_859 = const()[name = tensor<string, []>("op_859"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_860 = reshape(shape = var_859, x = var_858)[name = tensor<string, []>("op_860")];
+            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_857, y = var_860)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 3, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_817 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_817")];
-            tensor<bool, []> var_819_transpose_x_1 = const()[name = tensor<string, []>("op_819_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_819_transpose_y_1 = const()[name = tensor<string, []>("op_819_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_819 = matmul(transpose_x = var_819_transpose_x_1, transpose_y = var_819_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_817, y = var_819)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_821 = const()[name = tensor<string, []>("op_821"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_821)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_823 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_823")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_823)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_825_perm_0 = const()[name = tensor<string, []>("op_825_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_863 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_863")];
+            tensor<bool, []> var_865_transpose_x_1 = const()[name = tensor<string, []>("op_865_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_865_transpose_y_1 = const()[name = tensor<string, []>("op_865_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_865 = matmul(transpose_x = var_865_transpose_x_1, transpose_y = var_865_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_865")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_863, y = var_865)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_867)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_869 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_869")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_869)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_871_perm_0 = const()[name = tensor<string, []>("op_871_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_825 = transpose(perm = var_825_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_825)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_829, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 3, 256]> var_831 = silu(x = input_137)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [1, 3, 256]> input_139 = mul(x = var_831, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 3, 4, 64]> var_871 = transpose(perm = var_871_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_63, x = var_871)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_875, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 3, 256]> var_877 = silu(x = input_139)[name = tensor<string, []>("op_877")];
+            tensor<fp32, [1, 3, 256]> input_141 = mul(x = var_877, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_839_begin_0 = const()[name = tensor<string, []>("op_839_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_839_end_0 = const()[name = tensor<string, []>("op_839_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_839_end_mask_0 = const()[name = tensor<string, []>("op_839_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_839 = slice_by_index(begin = var_839_begin_0, end = var_839_end_0, end_mask = var_839_end_mask_0, x = x_21)[name = tensor<string, []>("op_839")];
-            tensor<int32, [3]> var_842_begin_0 = const()[name = tensor<string, []>("op_842_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_842_end_0 = const()[name = tensor<string, []>("op_842_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_842_end_mask_0 = const()[name = tensor<string, []>("op_842_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_842 = slice_by_index(begin = var_842_begin_0, end = var_842_end_0, end_mask = var_842_end_mask_0, x = window_25)[name = tensor<string, []>("op_842")];
+            tensor<int32, [3]> var_885_begin_0 = const()[name = tensor<string, []>("op_885_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_885_end_0 = const()[name = tensor<string, []>("op_885_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_885_end_mask_0 = const()[name = tensor<string, []>("op_885_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_885 = slice_by_index(begin = var_885_begin_0, end = var_885_end_0, end_mask = var_885_end_mask_0, x = x_21)[name = tensor<string, []>("op_885")];
+            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = window_25)[name = tensor<string, []>("op_888")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_842, var_839))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_847_begin_0 = const()[name = tensor<string, []>("op_847_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_847_end_0 = const()[name = tensor<string, []>("op_847_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_847_end_mask_0 = const()[name = tensor<string, []>("op_847_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_847 = slice_by_index(begin = var_847_begin_0, end = var_847_end_0, end_mask = var_847_end_mask_0, x = x_21)[name = tensor<string, []>("op_847")];
-            tensor<int32, [3]> var_850_begin_0 = const()[name = tensor<string, []>("op_850_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_850_end_0 = const()[name = tensor<string, []>("op_850_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_850_end_mask_0 = const()[name = tensor<string, []>("op_850_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_850 = slice_by_index(begin = var_850_begin_0, end = var_850_end_0, end_mask = var_850_end_mask_0, x = window_27)[name = tensor<string, []>("op_850")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_72, interleave = window_27_interleave_0, values = (var_888, var_885))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_893_begin_0 = const()[name = tensor<string, []>("op_893_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_893_end_0 = const()[name = tensor<string, []>("op_893_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_893_end_mask_0 = const()[name = tensor<string, []>("op_893_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_893 = slice_by_index(begin = var_893_begin_0, end = var_893_end_0, end_mask = var_893_end_mask_0, x = x_21)[name = tensor<string, []>("op_893")];
+            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = window_27)[name = tensor<string, []>("op_896")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_850, var_847))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_855_begin_0 = const()[name = tensor<string, []>("op_855_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_855_end_0 = const()[name = tensor<string, []>("op_855_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_855_end_mask_0 = const()[name = tensor<string, []>("op_855_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_855 = slice_by_index(begin = var_855_begin_0, end = var_855_end_0, end_mask = var_855_end_mask_0, x = x_21)[name = tensor<string, []>("op_855")];
-            tensor<int32, [3]> var_858_begin_0 = const()[name = tensor<string, []>("op_858_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_858_end_0 = const()[name = tensor<string, []>("op_858_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_858_end_mask_0 = const()[name = tensor<string, []>("op_858_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_858 = slice_by_index(begin = var_858_begin_0, end = var_858_end_0, end_mask = var_858_end_mask_0, x = window_29)[name = tensor<string, []>("op_858")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_72, interleave = window_29_interleave_0, values = (var_896, var_893))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_901_begin_0 = const()[name = tensor<string, []>("op_901_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_901_end_0 = const()[name = tensor<string, []>("op_901_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_901_end_mask_0 = const()[name = tensor<string, []>("op_901_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_901 = slice_by_index(begin = var_901_begin_0, end = var_901_end_0, end_mask = var_901_end_mask_0, x = x_21)[name = tensor<string, []>("op_901")];
+            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = window_29)[name = tensor<string, []>("op_904")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_858, var_855))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_72, interleave = window_interleave_0, values = (var_904, var_901))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_143 = concat(axis = var_58, interleave = input_143_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_883_split_sizes_0 = const()[name = tensor<string, []>("op_883_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_883_axis_0 = const()[name = tensor<string, []>("op_883_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_883_0, tensor<fp32, [3, 256, 16]> var_883_1 = split(axis = var_883_axis_0, split_sizes = var_883_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_883")];
-            tensor<fp32, [3, 256, 16]> var_885 = sigmoid(x = var_883_1)[name = tensor<string, []>("op_885")];
-            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_883_0, y = var_885)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [3, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_929_split_sizes_0 = const()[name = tensor<string, []>("op_929_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_929_0, tensor<fp32, [3, 256, 16]> var_929_1 = split(axis = var_929_axis_0, split_sizes = var_929_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [3, 256, 16]> var_931 = sigmoid(x = var_929_1)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_929_0, y = var_931)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [3, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_55, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [3, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_916_begin_0 = const()[name = tensor<string, []>("op_916_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_916_end_0 = const()[name = tensor<string, []>("op_916_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_916_end_mask_0 = const()[name = tensor<string, []>("op_916_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [3, 1, 256]> var_916 = slice_by_index(begin = var_916_begin_0, end = var_916_end_0, end_mask = var_916_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_916")];
-            tensor<int32, [3]> var_918_perm_0 = const()[name = tensor<string, []>("op_918_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_918 = transpose(perm = var_918_perm_0, x = var_916)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 3, 256]> input_151 = add(x = x_21, y = var_918)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 3, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 3, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_941 = const()[name = tensor<string, []>("op_941"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_942 = mul(x = input_159, y = var_941)[name = tensor<string, []>("op_942")];
-            tensor<fp32, [1, 3, 256]> input_161 = add(x = var_942, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [3, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_964_perm_0 = const()[name = tensor<string, []>("op_964_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_964 = transpose(perm = var_964_perm_0, x = var_962)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 3, 256]> input_153 = add(x = x_21, y = var_964)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_55, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 3, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 3, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_988 = mul(x = input_161, y = var_987)[name = tensor<string, []>("op_988")];
+            tensor<fp32, [1, 3, 256]> input_163 = add(x = var_988, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_55, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 3]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_60, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
-            tensor<int32, [3]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
-            tensor<bool, [3]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = cat)[name = tensor<string, []>("op_960")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 3, 1]> var_963 = reduce_l2_norm(axes = var_962, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_963")];
+            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1006_begin_0 = const()[name = tensor<string, []>("op_1006_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
+            tensor<int32, [3]> var_1006_end_0 = const()[name = tensor<string, []>("op_1006_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
+            tensor<bool, [3]> var_1006_end_mask_0 = const()[name = tensor<string, []>("op_1006_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1006_begin_0, end = var_1006_end_0, end_mask = var_1006_end_mask_0, x = cat)[name = tensor<string, []>("op_1006")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1008 = const()[name = tensor<string, []>("op_1008"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 3, 1]> var_1009 = reduce_l2_norm(axes = var_1008, keep_dims = var_54, x = input_165)[name = tensor<string, []>("op_1009")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_963)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_967_axis_0 = const()[name = tensor<string, []>("op_967_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_967_axis_0, values = (var_206, var_412, var_618, nkv_1))[name = tensor<string, []>("op_967")];
-            tensor<int32, []> var_969_axis_0 = const()[name = tensor<string, []>("op_969_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_969_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_969")];
-            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_971_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_971")];
-            tensor<fp32, []> var_980 = const()[name = tensor<string, []>("op_980"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_985 = const()[name = tensor<string, []>("op_985"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_988 = const()[name = tensor<string, []>("op_988"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_990 = const()[name = tensor<string, []>("op_990"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1000 = const()[name = tensor<string, []>("op_1000"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_68, beta = const_12, x = var_1009)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1013_axis_0 = const()[name = tensor<string, []>("op_1013_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1013_axis_0, values = (var_252, var_458, var_664, nkv_1))[name = tensor<string, []>("op_1013")];
+            tensor<int32, []> var_1015_axis_0 = const()[name = tensor<string, []>("op_1015_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1015_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1015")];
+            tensor<int32, []> var_1017_axis_0 = const()[name = tensor<string, []>("op_1017_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1017_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_1017")];
             tensor<fp32, [1, 3, 6, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 3, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1062_axes_0 = const()[name = tensor<string, []>("op_1062_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 3, 1, 256]> var_1062 = expand_dims(axes = var_1062_axes_0, x = emb)[name = tensor<string, []>("op_1062")];
+            tensor<int32, [1]> var_1085_axes_0 = const()[name = tensor<string, []>("op_1085_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 3, 1, 256]> var_1085 = expand_dims(axes = var_1085_axes_0, x = emb)[name = tensor<string, []>("op_1085")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 6, 1])];
-            tensor<fp32, [1, 3, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1062)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 3, 6, 512]> input_165 = concat(axis = var_994, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 3, 6, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1070_perm_0 = const()[name = tensor<string, []>("op_1070_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1074 = const()[name = tensor<string, []>("op_1074"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [1, 6, 3, 256]> var_1070 = transpose(perm = var_1070_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [6, 3, 256]> x_29 = reshape(shape = var_1074, x = var_1070)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 3, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1085)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 3, 6, 512]> input_167 = concat(axis = var_61, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 3, 6, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1093_perm_0 = const()[name = tensor<string, []>("op_1093_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [1, 6, 3, 256]> var_1093 = transpose(perm = var_1093_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [6, 3, 256]> x_29 = reshape(shape = var_1097, x = var_1093)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -943,132 +954,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [6, 3, 256]> var_1082 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1083 = const()[name = tensor<string, []>("op_1083"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1084 = reshape(shape = var_1083, x = var_1082)[name = tensor<string, []>("op_1084")];
+            tensor<fp32, [6, 3, 256]> var_1105 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1107 = reshape(shape = var_1106, x = var_1105)[name = tensor<string, []>("op_1107")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> var_1088 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1089 = const()[name = tensor<string, []>("op_1089"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 3, 256]> var_1090 = mul(x = var_1088, y = var_1089)[name = tensor<string, []>("op_1090")];
-            tensor<int32, [4]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1092 = reshape(shape = var_1091, x = var_1090)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [6, 3, 256]> var_1111 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1112 = const()[name = tensor<string, []>("op_1112"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 3, 256]> var_1113 = mul(x = var_1111, y = var_1112)[name = tensor<string, []>("op_1113")];
+            tensor<int32, [4]> var_1114 = const()[name = tensor<string, []>("op_1114"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1115 = reshape(shape = var_1114, x = var_1113)[name = tensor<string, []>("op_1115")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> var_1096 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1098 = reshape(shape = var_1097, x = var_1096)[name = tensor<string, []>("op_1098")];
+            tensor<fp32, [6, 3, 256]> var_1119 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [6, 3, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_1000, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_58, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [3]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_1 = clip(alpha = var_990, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [3]> clip_1 = clip(alpha = var_48, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [3]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1092)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [6, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1084)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [6, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1115)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [6, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1107)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [6, 4, 3, 3]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [2]>([1, 3])];
-            tensor<fp32, [1, 3]> var_1111 = reshape(shape = var_1110, x = valid_mask)[name = tensor<string, []>("op_1111")];
-            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1111)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1114 = reshape(shape = var_1113, x = sqrt_s_t_9)[name = tensor<string, []>("op_1114")];
-            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1114)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [6, 4, 3, 3]> var_1116 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1116")];
+            tensor<int32, [2]> var_1133 = const()[name = tensor<string, []>("op_1133"), val = tensor<int32, [2]>([1, 3])];
+            tensor<fp32, [1, 3]> var_1134 = reshape(shape = var_1133, x = valid_mask)[name = tensor<string, []>("op_1134")];
+            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1134)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1136 = const()[name = tensor<string, []>("op_1136"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1137 = reshape(shape = var_1136, x = sqrt_s_t_9)[name = tensor<string, []>("op_1137")];
+            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1137)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [6, 4, 3, 3]> var_1139 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1139")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1098)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [6, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1116, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1118_transpose_x_0 = const()[name = tensor<string, []>("op_1118_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1118_transpose_y_0 = const()[name = tensor<string, []>("op_1118_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 3, 64]> var_1118 = matmul(transpose_x = var_1118_transpose_x_0, transpose_y = var_1118_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1118")];
-            tensor<fp32, [3]> var_1119 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1119")];
-            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
-            tensor<fp32, [6, 4, 3, 64]> cross_9 = mul(x = var_1118, y = var_1121)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [6, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1121)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [6, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1139, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1141_transpose_x_0 = const()[name = tensor<string, []>("op_1141_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1141_transpose_y_0 = const()[name = tensor<string, []>("op_1141_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 3, 64]> var_1141 = matmul(transpose_x = var_1141_transpose_x_0, transpose_y = var_1141_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1141")];
+            tensor<fp32, [3]> var_1142 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1142")];
+            tensor<int32, [4]> var_1143 = const()[name = tensor<string, []>("op_1143"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1144 = reshape(shape = var_1143, x = var_1142)[name = tensor<string, []>("op_1144")];
+            tensor<fp32, [6, 4, 3, 64]> cross_9 = mul(x = var_1141, y = var_1144)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [6, 4, 3, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1124 = const()[name = tensor<string, []>("op_1124"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1125 = reshape(shape = var_1124, x = valid_mask)[name = tensor<string, []>("op_1125")];
-            tensor<fp32, [6, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1125)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1127 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1127")];
-            tensor<bool, []> var_1129_transpose_x_1 = const()[name = tensor<string, []>("op_1129_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1129_transpose_y_1 = const()[name = tensor<string, []>("op_1129_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1129 = matmul(transpose_x = var_1129_transpose_x_1, transpose_y = var_1129_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1129")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1127, y = var_1129)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1131_keep_dims_0 = const()[name = tensor<string, []>("op_1131_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1131 = reduce_sum(keep_dims = var_1131_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1131")];
-            tensor<int32, [1]> var_1132 = const()[name = tensor<string, []>("op_1132"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1133 = reshape(shape = var_1132, x = var_1131)[name = tensor<string, []>("op_1133")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1133)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1147 = const()[name = tensor<string, []>("op_1147"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1148 = reshape(shape = var_1147, x = valid_mask)[name = tensor<string, []>("op_1148")];
+            tensor<fp32, [6, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1148)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [6, 4, 64, 64]> var_1150 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1150")];
+            tensor<bool, []> var_1152_transpose_x_1 = const()[name = tensor<string, []>("op_1152_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1152_transpose_y_1 = const()[name = tensor<string, []>("op_1152_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1152 = matmul(transpose_x = var_1152_transpose_x_1, transpose_y = var_1152_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1152")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1150, y = var_1152)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1154_keep_dims_0 = const()[name = tensor<string, []>("op_1154_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1154 = reduce_sum(keep_dims = var_1154_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1154")];
+            tensor<int32, [1]> var_1155 = const()[name = tensor<string, []>("op_1155"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1156 = reshape(shape = var_1155, x = var_1154)[name = tensor<string, []>("op_1156")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1156)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_990, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_48, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1137 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1137")];
-            tensor<int32, [4]> var_1138_perm_0 = const()[name = tensor<string, []>("op_1138_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [6, 4, 64, 64]> var_1160 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1160")];
+            tensor<int32, [4]> var_1161_perm_0 = const()[name = tensor<string, []>("op_1161_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 3, 4, 64]> var_1138 = transpose(perm = var_1138_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [6, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_987, x = var_1138)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [6, 3, 256]> out_29 = reshape(shape = var_1142, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [6, 3, 256]> var_1144 = silu(x = input_169)[name = tensor<string, []>("op_1144")];
-            tensor<fp32, [6, 3, 256]> input_171 = mul(x = var_1144, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [6, 3, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [6, 3, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 3, 4, 64]> var_1161 = transpose(perm = var_1161_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [6, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_63, x = var_1161)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [6, 3, 256]> out_29 = reshape(shape = var_1165, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [6, 3, 256]> var_1167 = silu(x = input_171)[name = tensor<string, []>("op_1167")];
+            tensor<fp32, [6, 3, 256]> input_173 = mul(x = var_1167, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 3, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [6, 3, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_985, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 6, 3, 256])];
-            tensor<fp32, [1, 6, 3, 256]> var_1155 = reshape(shape = var_1154, x = xt_1)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156_perm_0 = const()[name = tensor<string, []>("op_1156_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [3]>([3, 6, 256])];
-            tensor<fp32, [1, 3, 6, 256]> var_1156 = transpose(perm = var_1156_perm_0, x = var_1155)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [3, 6, 256]> query_1 = reshape(shape = var_1159, x = var_1156)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [6, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_55, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [4]>([1, 6, 3, 256])];
+            tensor<fp32, [1, 6, 3, 256]> var_1178 = reshape(shape = var_1177, x = xt_1)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [4]> var_1179_perm_0 = const()[name = tensor<string, []>("op_1179_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([3, 6, 256])];
+            tensor<fp32, [1, 3, 6, 256]> var_1179 = transpose(perm = var_1179_perm_0, x = var_1178)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [3, 6, 256]> query_1 = reshape(shape = var_1182, x = var_1179)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 3, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [6, 3, 768]> var_1182 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [6, 3, 768]> var_1205 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([6, 3, 3, 256])];
-            tensor<fp32, [6, 3, 3, 256]> var_1184 = reshape(shape = concat_1, x = var_1182)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [1]> var_1185_axes_0 = const()[name = tensor<string, []>("op_1185_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 3, 3, 256]> var_1185 = expand_dims(axes = var_1185_axes_0, x = var_1184)[name = tensor<string, []>("op_1185")];
-            tensor<int32, [5]> var_1186_perm_0 = const()[name = tensor<string, []>("op_1186_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1187_axes_0 = const()[name = tensor<string, []>("op_1187_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 3, 1, 256]> var_1186 = transpose(perm = var_1186_perm_0, x = var_1185)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 6, 3, 256]> var_1187 = squeeze(axes = var_1187_axes_0, x = var_1186)[name = tensor<string, []>("op_1187")];
+            tensor<fp32, [6, 3, 3, 256]> var_1207 = reshape(shape = concat_1, x = var_1205)[name = tensor<string, []>("op_1207")];
+            tensor<int32, [1]> var_1208_axes_0 = const()[name = tensor<string, []>("op_1208_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 3, 3, 256]> var_1208 = expand_dims(axes = var_1208_axes_0, x = var_1207)[name = tensor<string, []>("op_1208")];
+            tensor<int32, [5]> var_1209_perm_0 = const()[name = tensor<string, []>("op_1209_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1210_axes_0 = const()[name = tensor<string, []>("op_1210_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 3, 1, 256]> var_1209 = transpose(perm = var_1209_perm_0, x = var_1208)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 6, 3, 256]> var_1210 = squeeze(axes = var_1210_axes_0, x = var_1209)[name = tensor<string, []>("op_1210")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 6, 3, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [6, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 6, 3, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [6, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 6, 3, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1195 = const()[name = tensor<string, []>("op_1195"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1196 = reshape(shape = var_1195, x = q_11)[name = tensor<string, []>("op_1196")];
+            tensor<fp32, [6, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1219 = reshape(shape = var_1218, x = q_11)[name = tensor<string, []>("op_1219")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1202 = const()[name = tensor<string, []>("op_1202"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1203 = reshape(shape = var_1202, x = k_11)[name = tensor<string, []>("op_1203")];
+            tensor<int32, [3]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1226 = reshape(shape = var_1225, x = k_11)[name = tensor<string, []>("op_1226")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1210 = reshape(shape = var_1209, x = v_11)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [3]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1233 = reshape(shape = var_1232, x = v_11)[name = tensor<string, []>("op_1233")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1213 = const()[name = tensor<string, []>("op_1213"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1196)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [3, 4, 6, 64]> q_15 = reshape(shape = var_1213, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1215 = const()[name = tensor<string, []>("op_1215"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1203)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [3, 4, 6, 64]> k_15 = reshape(shape = var_1215, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1210)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [3, 4, 6, 64]> v_15 = reshape(shape = var_1217, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1219)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [3, 4, 6, 64]> q_15 = reshape(shape = var_1236, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1226)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [3, 4, 6, 64]> k_15 = reshape(shape = var_1238, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1233)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [3, 4, 6, 64]> v_15 = reshape(shape = var_1240, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 6, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1079,30 +1090,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 6, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1220 = const()[name = tensor<string, []>("op_1220"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [2]>([18, 256])];
-            tensor<fp32, [6, 3, 4, 64]> var_1221 = transpose(perm = var_1220, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [18, 256]> attn_output_3 = reshape(shape = var_1225, x = var_1221)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [18, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1229 = const()[name = tensor<string, []>("op_1229"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [6, 3, 256]> attn_output_7 = reshape(shape = var_1229, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1243 = const()[name = tensor<string, []>("op_1243"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1248 = const()[name = tensor<string, []>("op_1248"), val = tensor<int32, [2]>([18, 256])];
+            tensor<fp32, [6, 3, 4, 64]> var_1244 = transpose(perm = var_1243, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [18, 256]> attn_output_3 = reshape(shape = var_1248, x = var_1244)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [18, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [6, 3, 256]> attn_output_7 = reshape(shape = var_1252, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 6, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [3, 6, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 6, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_985, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [3, 6, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [3, 6, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [3, 6, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [3, 6, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 6, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 6, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_55, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [3, 6, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [3, 6, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 6, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [3, 6, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_985, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([1, 3, 6, 256])];
-            tensor<fp32, [1, 3, 6, 256]> x_31 = reshape(shape = var_1249, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1255 = const()[name = tensor<string, []>("op_1255"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [1, 6, 3, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [6, 3, 256]> x = reshape(shape = var_1255, x = var_1251)[name = tensor<string, []>("x")];
+            tensor<fp32, [3, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_55, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([1, 3, 6, 256])];
+            tensor<fp32, [1, 3, 6, 256]> x_31 = reshape(shape = var_1272, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1274_perm_0 = const()[name = tensor<string, []>("op_1274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [1, 6, 3, 256]> var_1274 = transpose(perm = var_1274_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [6, 3, 256]> x = reshape(shape = var_1278, x = var_1274)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1113,120 +1124,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [6, 3, 256]> var_1263 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1264 = const()[name = tensor<string, []>("op_1264"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1265 = reshape(shape = var_1264, x = var_1263)[name = tensor<string, []>("op_1265")];
+            tensor<fp32, [6, 3, 256]> var_1286 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1288 = reshape(shape = var_1287, x = var_1286)[name = tensor<string, []>("op_1288")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> var_1269 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1270 = const()[name = tensor<string, []>("op_1270"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 3, 256]> var_1271 = mul(x = var_1269, y = var_1270)[name = tensor<string, []>("op_1271")];
-            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1273 = reshape(shape = var_1272, x = var_1271)[name = tensor<string, []>("op_1273")];
+            tensor<fp32, [6, 3, 256]> var_1292 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1293 = const()[name = tensor<string, []>("op_1293"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 3, 256]> var_1294 = mul(x = var_1292, y = var_1293)[name = tensor<string, []>("op_1294")];
+            tensor<int32, [4]> var_1295 = const()[name = tensor<string, []>("op_1295"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1296 = reshape(shape = var_1295, x = var_1294)[name = tensor<string, []>("op_1296")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> var_1277 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([6, 3, 4, 64])];
-            tensor<fp32, [6, 3, 4, 64]> var_1279 = reshape(shape = var_1278, x = var_1277)[name = tensor<string, []>("op_1279")];
+            tensor<fp32, [6, 3, 256]> var_1300 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([6, 3, 4, 64])];
+            tensor<fp32, [6, 3, 4, 64]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 3, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [6, 3, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [3]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_3 = clip(alpha = var_990, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [3]> clip_3 = clip(alpha = var_48, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [3]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1273)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [6, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1265)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [6, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1296)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [6, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1288)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [6, 4, 3, 3]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1295 = reshape(shape = var_1294, x = sqrt_s_t)[name = tensor<string, []>("op_1295")];
-            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1295)[name = tensor<string, []>("M")];
-            tensor<fp32, [6, 4, 3, 3]> var_1297 = mul(x = qk, y = M)[name = tensor<string, []>("op_1297")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1279)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [6, 4, 3, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1297, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1299_transpose_x_0 = const()[name = tensor<string, []>("op_1299_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1299_transpose_y_0 = const()[name = tensor<string, []>("op_1299_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 3, 64]> var_1299 = matmul(transpose_x = var_1299_transpose_x_0, transpose_y = var_1299_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1299")];
-            tensor<fp32, [3]> var_1300 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1300")];
-            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
-            tensor<fp32, [6, 4, 3, 64]> cross = mul(x = var_1299, y = var_1302)[name = tensor<string, []>("cross")];
-            tensor<fp32, [6, 4, 3, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [6, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1125)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [6, 4, 64, 64]> var_1308 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1308")];
-            tensor<bool, []> var_1310_transpose_x_1 = const()[name = tensor<string, []>("op_1310_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1310_transpose_y_1 = const()[name = tensor<string, []>("op_1310_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1310 = matmul(transpose_x = var_1310_transpose_x_1, transpose_y = var_1310_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1310")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1308, y = var_1310)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1133)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1318 = reshape(shape = var_1317, x = sqrt_s_t)[name = tensor<string, []>("op_1318")];
+            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1318)[name = tensor<string, []>("M")];
+            tensor<fp32, [6, 4, 3, 3]> var_1320 = mul(x = qk, y = M)[name = tensor<string, []>("op_1320")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1302)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [6, 4, 3, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1320, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1322_transpose_x_0 = const()[name = tensor<string, []>("op_1322_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1322_transpose_y_0 = const()[name = tensor<string, []>("op_1322_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 3, 64]> var_1322 = matmul(transpose_x = var_1322_transpose_x_0, transpose_y = var_1322_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1322")];
+            tensor<fp32, [3]> var_1323 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1323")];
+            tensor<int32, [4]> var_1324 = const()[name = tensor<string, []>("op_1324"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1325 = reshape(shape = var_1324, x = var_1323)[name = tensor<string, []>("op_1325")];
+            tensor<fp32, [6, 4, 3, 64]> cross = mul(x = var_1322, y = var_1325)[name = tensor<string, []>("cross")];
+            tensor<fp32, [6, 4, 3, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [6, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1148)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [6, 4, 64, 64]> var_1331 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1331")];
+            tensor<bool, []> var_1333_transpose_x_1 = const()[name = tensor<string, []>("op_1333_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1333_transpose_y_1 = const()[name = tensor<string, []>("op_1333_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1333 = matmul(transpose_x = var_1333_transpose_x_1, transpose_y = var_1333_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1333")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1331, y = var_1333)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1156)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_990, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_48, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [6, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1319_perm_0 = const()[name = tensor<string, []>("op_1319_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1342_perm_0 = const()[name = tensor<string, []>("op_1342_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 3, 4, 64]> var_1319 = transpose(perm = var_1319_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [6, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_987, x = var_1319)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [6, 3, 256]> out = reshape(shape = var_1323, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [6, 3, 256]> var_1325 = silu(x = input_187)[name = tensor<string, []>("op_1325")];
-            tensor<fp32, [6, 3, 256]> input_189 = mul(x = var_1325, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [6, 3, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [6, 3, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 3, 4, 64]> var_1342 = transpose(perm = var_1342_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [6, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_63, x = var_1342)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [6, 3, 256]> out = reshape(shape = var_1346, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [6, 3, 256]> var_1348 = silu(x = input_189)[name = tensor<string, []>("op_1348")];
+            tensor<fp32, [6, 3, 256]> input_191 = mul(x = var_1348, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 3, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [6, 3, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_985, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 6, 3, 256])];
-            tensor<fp32, [1, 6, 3, 256]> var_1336 = reshape(shape = var_1335, x = xt_5)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337_perm_0 = const()[name = tensor<string, []>("op_1337_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [3]>([3, 6, 256])];
-            tensor<fp32, [1, 3, 6, 256]> var_1337 = transpose(perm = var_1337_perm_0, x = var_1336)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [3, 6, 256]> query_5 = reshape(shape = var_1340, x = var_1337)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [6, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_55, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [4]>([1, 6, 3, 256])];
+            tensor<fp32, [1, 6, 3, 256]> var_1359 = reshape(shape = var_1358, x = xt_5)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [4]> var_1360_perm_0 = const()[name = tensor<string, []>("op_1360_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([3, 6, 256])];
+            tensor<fp32, [1, 3, 6, 256]> var_1360 = transpose(perm = var_1360_perm_0, x = var_1359)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [3, 6, 256]> query_5 = reshape(shape = var_1363, x = var_1360)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 3, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [6, 3, 768]> var_1363 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [6, 3, 768]> var_1386 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([6, 3, 3, 256])];
-            tensor<fp32, [6, 3, 3, 256]> var_1365 = reshape(shape = concat_2, x = var_1363)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [1]> var_1366_axes_0 = const()[name = tensor<string, []>("op_1366_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 3, 3, 256]> var_1366 = expand_dims(axes = var_1366_axes_0, x = var_1365)[name = tensor<string, []>("op_1366")];
-            tensor<int32, [5]> var_1367_perm_0 = const()[name = tensor<string, []>("op_1367_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1368_axes_0 = const()[name = tensor<string, []>("op_1368_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 3, 1, 256]> var_1367 = transpose(perm = var_1367_perm_0, x = var_1366)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 6, 3, 256]> var_1368 = squeeze(axes = var_1368_axes_0, x = var_1367)[name = tensor<string, []>("op_1368")];
+            tensor<fp32, [6, 3, 3, 256]> var_1388 = reshape(shape = concat_2, x = var_1386)[name = tensor<string, []>("op_1388")];
+            tensor<int32, [1]> var_1389_axes_0 = const()[name = tensor<string, []>("op_1389_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 3, 3, 256]> var_1389 = expand_dims(axes = var_1389_axes_0, x = var_1388)[name = tensor<string, []>("op_1389")];
+            tensor<int32, [5]> var_1390_perm_0 = const()[name = tensor<string, []>("op_1390_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1391_axes_0 = const()[name = tensor<string, []>("op_1391_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 3, 1, 256]> var_1390 = transpose(perm = var_1390_perm_0, x = var_1389)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 6, 3, 256]> var_1391 = squeeze(axes = var_1391_axes_0, x = var_1390)[name = tensor<string, []>("op_1391")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 6, 3, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [6, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 6, 3, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [6, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 6, 3, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1376 = const()[name = tensor<string, []>("op_1376"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1377 = reshape(shape = var_1376, x = q_19)[name = tensor<string, []>("op_1377")];
+            tensor<fp32, [6, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1399 = const()[name = tensor<string, []>("op_1399"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1400 = reshape(shape = var_1399, x = q_19)[name = tensor<string, []>("op_1400")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1383 = const()[name = tensor<string, []>("op_1383"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1384 = reshape(shape = var_1383, x = k_19)[name = tensor<string, []>("op_1384")];
+            tensor<int32, [3]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1407 = reshape(shape = var_1406, x = k_19)[name = tensor<string, []>("op_1407")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1390 = const()[name = tensor<string, []>("op_1390"), val = tensor<int32, [3]>([6, 12, 64])];
-            tensor<fp32, [6, 12, 64]> var_1391 = reshape(shape = var_1390, x = v_19)[name = tensor<string, []>("op_1391")];
+            tensor<int32, [3]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [3]>([6, 12, 64])];
+            tensor<fp32, [6, 12, 64]> var_1414 = reshape(shape = var_1413, x = v_19)[name = tensor<string, []>("op_1414")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1394 = const()[name = tensor<string, []>("op_1394"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1377)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [3, 4, 6, 64]> q = reshape(shape = var_1394, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1396 = const()[name = tensor<string, []>("op_1396"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1384)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [3, 4, 6, 64]> k = reshape(shape = var_1396, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([3, 4, 6, 64])];
-            tensor<fp32, [12, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1391)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [3, 4, 6, 64]> v = reshape(shape = var_1398, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1417 = const()[name = tensor<string, []>("op_1417"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1400)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [3, 4, 6, 64]> q = reshape(shape = var_1417, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1419 = const()[name = tensor<string, []>("op_1419"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1407)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [3, 4, 6, 64]> k = reshape(shape = var_1419, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1421 = const()[name = tensor<string, []>("op_1421"), val = tensor<int32, [4]>([3, 4, 6, 64])];
+            tensor<fp32, [12, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1414)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [3, 4, 6, 64]> v = reshape(shape = var_1421, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 6, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1237,36 +1248,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 6, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1401 = const()[name = tensor<string, []>("op_1401"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([18, 256])];
-            tensor<fp32, [6, 3, 4, 64]> var_1402 = transpose(perm = var_1401, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [18, 256]> attn_output_11 = reshape(shape = var_1406, x = var_1402)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [18, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1410 = const()[name = tensor<string, []>("op_1410"), val = tensor<int32, [3]>([6, 3, 256])];
-            tensor<fp32, [6, 3, 256]> attn_output = reshape(shape = var_1410, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1424 = const()[name = tensor<string, []>("op_1424"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1429 = const()[name = tensor<string, []>("op_1429"), val = tensor<int32, [2]>([18, 256])];
+            tensor<fp32, [6, 3, 4, 64]> var_1425 = transpose(perm = var_1424, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [18, 256]> attn_output_11 = reshape(shape = var_1429, x = var_1425)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [18, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [3]>([6, 3, 256])];
+            tensor<fp32, [6, 3, 256]> attn_output = reshape(shape = var_1433, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 6, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [3, 6, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 6, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_985, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [3, 6, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [3, 6, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [3, 6, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [3, 6, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 6, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 6, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_55, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [3, 6, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [3, 6, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 6, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [3, 6, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_985, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([1, 3, 6, 256])];
-            tensor<fp32, [1, 3, 6, 256]> input = reshape(shape = var_1430, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1432 = const()[name = tensor<string, []>("op_1432"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 6, 1]> var_1433 = reduce_l2_norm(axes = var_1432, keep_dims = var_988, x = input)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [3, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_55, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [4]>([1, 3, 6, 256])];
+            tensor<fp32, [1, 3, 6, 256]> input = reshape(shape = var_1453, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 6, 1]> var_1456 = reduce_l2_norm(axes = var_1455, keep_dims = var_54, x = input)[name = tensor<string, []>("op_1456")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 6, 1]> clip_5 = clip(alpha = var_980, beta = const_42, x = var_1433)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 3, 6, 256]> var_1435 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1435")];
+            tensor<fp32, [1, 3, 6, 1]> clip_5 = clip(alpha = var_68, beta = const_42, x = var_1456)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 3, 6, 256]> var_1458 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1458")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([3, 1, 256])];
             tensor<fp32, [3, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([3, 256, 6])];
-            tensor<fp32, [1, 3, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1435)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 3, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1458)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [3, 256, 6]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1277,10 +1288,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 3, 5])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 3, 4]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 3, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1439")];
-            tensor<int32, []> var_1441_axis_0 = const()[name = tensor<string, []>("op_1441_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1441_axis_0, values = (var_1137, nkv))[name = tensor<string, []>("op_1441")];
-            tensor<int32, []> var_1443_axis_0 = const()[name = tensor<string, []>("op_1443_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1443_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1443")];
+            tensor<fp32, [1, 3, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1462")];
+            tensor<int32, []> var_1464_axis_0 = const()[name = tensor<string, []>("op_1464_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1464_axis_0, values = (var_1160, nkv))[name = tensor<string, []>("op_1464")];
+            tensor<int32, []> var_1466_axis_0 = const()[name = tensor<string, []>("op_1466_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1466_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1466")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 3ee6a266410b4c8f967985e6dd4010971e326dda..bce4e082d0c99aa965c5b822189ac93ab0474b36 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7c02e235e84b720c6aaf09d92bbac369c38d556327dd8f5f72579d9e76c86d43
-size 185451
+oid sha256:7b50c82d16896cf3aa0109f4b74206ce7f7fdcbdee5fe341dfa6406075be97d3
+size 190996
diff --git a/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Manifest.json b/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Manifest.json
index c9c2cd6e7172b08333ab9eafcbbc60ac652c7e02..2020c11765ac2eddf1513882c9c54dcecf8ce40d 100644
--- a/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Manifest.json
+++ b/optimized/ami/300ms/ls_eend_ami_300ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "68330719-A46B-4379-8B85-C9B4E8FD52AF": {
+        "8152B853-43F4-4A6B-92C5-99387E9814E3": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         },
-        "E31DDEE0-26D5-4CE8-AD79-31130F2DF498": {
+        "F91AC4AF-14C0-45F7-BDD2-A116D8D48885": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "68330719-A46B-4379-8B85-C9B4E8FD52AF"
+    "rootModelIdentifier": "8152B853-43F4-4A6B-92C5-99387E9814E3"
 }
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/analytics/coremldata.bin b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/analytics/coremldata.bin
index 991ad5aa06a48e1d0f11baf09e0e8defe19c30ba..1f275b6c49d69319653d30ab214c476112d49cd3 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5f8f8f57d8e374772e2d54b8950c8de8c6a3a8cc10450c3832b9bf5e75ce240
+oid sha256:a6758d5ecdb39569a10f08b51c266206c2824dc825070864428f42ace8434465
 size 243
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/coremldata.bin b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/coremldata.bin
index 2e9e19cf1f89eb088959423ab8f66c6e00895ceb..8d4b5942ba1b7b92e69b60b50b2b17899354b405 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/coremldata.bin
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71e5da37ce4f16ebb4f5b2a2ef320e4f17aee294a47d49fa31f021a549ed1af3
-size 1292
+oid sha256:976e4dbdf87c668f484460fb519558b7c5d6040a00e786a40be1c8ba08e57ac9
+size 1395
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/metadata.json b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/metadata.json
index 02cd719d1f593b541368c72808de5e7b68e572ad..789e43d6c81056b77a84b77dc3a1249d69757faa 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/metadata.json
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=4, max_speakers=4)",
+    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=4, max_speakers=4, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 64,
+      "Ios17.sliceByIndex" : 68,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 22,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 4 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 45 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 4, 345]",
+        "shape" : "[1, 45, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 45}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/model.mil b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/model.mil
index 189cd5c18852bac2bde041df8ebc35de2ca1a491..9e45f1e77725011414fcf74aa7144ae43d4ba02e 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/model.mil
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlmodelc/model.mil
@@ -1,234 +1,256 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 4, 345]> features, tensor<fp32, [4]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [4, 4]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [4]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
-            tensor<fp32, [4, 4]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 45, 23]> features, tensor<fp32, [4]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [4, 4]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [4]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
+            tensor<fp32, [4, 4]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<int32, [3]>([1, 4, 345])];
+            tensor<fp32, [1, 4, 345]> input_1 = reshape(shape = var_56, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_58 = const()[name = tensor<string, []>("op_58"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_64 = const()[name = tensor<string, []>("op_64"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_65 = const()[name = tensor<string, []>("op_65"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_68 = const()[name = tensor<string, []>("op_68"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_70 = const()[name = tensor<string, []>("op_70"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_71 = const()[name = tensor<string, []>("op_71"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_73 = const()[name = tensor<string, []>("op_73"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_78 = const()[name = tensor<string, []>("op_78"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 4, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 4, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 4, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_65, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 4, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 4, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_204 = mul(x = input_13, y = var_203)[name = tensor<string, []>("op_204")];
+            tensor<fp32, [1, 4, 256]> input_15 = add(x = var_204, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_65, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,173 +261,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 4, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 4, 256]> var_218 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_219 = const()[name = tensor<string, []>("op_219"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_220 = reshape(shape = var_219, x = var_218)[name = tensor<string, []>("op_220")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 4, 256]> var_224 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_225 = const()[name = tensor<string, []>("op_225"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_226 = mul(x = var_224, y = var_225)[name = tensor<string, []>("op_226")];
+            tensor<int32, [4]> var_227 = const()[name = tensor<string, []>("op_227"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_228 = reshape(shape = var_227, x = var_226)[name = tensor<string, []>("op_228")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 4, 256]> var_232 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_233 = const()[name = tensor<string, []>("op_233"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_234 = reshape(shape = var_233, x = var_232)[name = tensor<string, []>("op_234")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 4, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [4]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_228)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_220)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 4, 4]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [4, 4]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 4, 4]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_245 = reshape(shape = var_244, x = sqrt_s_t_1)[name = tensor<string, []>("op_245")];
+            tensor<fp32, [4, 4]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_245)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 4, 4]> var_247 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_247")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [4]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_234)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_247, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_249_transpose_x_0 = const()[name = tensor<string, []>("op_249_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_249_transpose_y_0 = const()[name = tensor<string, []>("op_249_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_249 = matmul(transpose_x = var_249_transpose_x_0, transpose_y = var_249_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [4]> var_250 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_250")];
+            tensor<int32, [4]> var_251 = const()[name = tensor<string, []>("op_251"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_252 = reshape(shape = var_251, x = var_250)[name = tensor<string, []>("op_252")];
+            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_249, y = var_252)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 4, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_255 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_255")];
+            tensor<bool, []> var_257_transpose_x_1 = const()[name = tensor<string, []>("op_257_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_257_transpose_y_1 = const()[name = tensor<string, []>("op_257_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_257 = matmul(transpose_x = var_257_transpose_x_1, transpose_y = var_257_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_255, y = var_257)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_259 = const()[name = tensor<string, []>("op_259"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_259)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_261 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_261")];
+            tensor<fp32, [1, 4, 64, 64]> var_262 = real_div(x = new_kv_unnorm_1, y = var_261)[name = tensor<string, []>("op_262")];
+            tensor<int32, [4]> var_263_perm_0 = const()[name = tensor<string, []>("op_263_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 4, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 4, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 4, 4, 64]> var_263 = transpose(perm = var_263_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_73, x = var_263)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_267 = const()[name = tensor<string, []>("op_267"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_267, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 4, 256]> var_269 = silu(x = input_19)[name = tensor<string, []>("op_269")];
+            tensor<fp32, [1, 4, 256]> input_21 = mul(x = var_269, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_277_begin_0 = const()[name = tensor<string, []>("op_277_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_277_end_0 = const()[name = tensor<string, []>("op_277_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_277_end_mask_0 = const()[name = tensor<string, []>("op_277_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_277 = slice_by_index(begin = var_277_begin_0, end = var_277_end_0, end_mask = var_277_end_mask_0, x = x_3)[name = tensor<string, []>("op_277")];
+            tensor<int32, [3]> var_280_begin_0 = const()[name = tensor<string, []>("op_280_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_280_end_0 = const()[name = tensor<string, []>("op_280_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_280_end_mask_0 = const()[name = tensor<string, []>("op_280_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_280 = slice_by_index(begin = var_280_begin_0, end = var_280_end_0, end_mask = var_280_end_mask_0, x = window_1)[name = tensor<string, []>("op_280")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_82, interleave = window_3_interleave_0, values = (var_280, var_277))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_285_begin_0 = const()[name = tensor<string, []>("op_285_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_285_end_0 = const()[name = tensor<string, []>("op_285_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_285_end_mask_0 = const()[name = tensor<string, []>("op_285_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_285 = slice_by_index(begin = var_285_begin_0, end = var_285_end_0, end_mask = var_285_end_mask_0, x = x_3)[name = tensor<string, []>("op_285")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = window_3)[name = tensor<string, []>("op_288")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_82, interleave = window_5_interleave_0, values = (var_288, var_285))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_293_begin_0 = const()[name = tensor<string, []>("op_293_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_293_end_0 = const()[name = tensor<string, []>("op_293_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_293_end_mask_0 = const()[name = tensor<string, []>("op_293_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_293 = slice_by_index(begin = var_293_begin_0, end = var_293_end_0, end_mask = var_293_end_mask_0, x = x_3)[name = tensor<string, []>("op_293")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = window_5)[name = tensor<string, []>("op_296")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_245_begin_0 = const()[name = tensor<string, []>("op_245_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_245_end_0 = const()[name = tensor<string, []>("op_245_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_245_end_mask_0 = const()[name = tensor<string, []>("op_245_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_245 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = x_3)[name = tensor<string, []>("op_245")];
-            tensor<int32, [3]> var_248_begin_0 = const()[name = tensor<string, []>("op_248_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_248_end_0 = const()[name = tensor<string, []>("op_248_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_248_end_mask_0 = const()[name = tensor<string, []>("op_248_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_248 = slice_by_index(begin = var_248_begin_0, end = var_248_end_0, end_mask = var_248_end_mask_0, x = window_7)[name = tensor<string, []>("op_248")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_82, interleave = window_7_interleave_0, values = (var_296, var_293))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_301 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = x_3)[name = tensor<string, []>("op_301")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = window_7)[name = tensor<string, []>("op_304")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_248, var_245))[name = tensor<string, []>("window_9")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_82, interleave = window_9_interleave_0, values = (var_304, var_301))[name = tensor<string, []>("window_9")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_23 = concat(axis = var_68, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_273_split_sizes_0 = const()[name = tensor<string, []>("op_273_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_273_axis_0 = const()[name = tensor<string, []>("op_273_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_273_0, tensor<fp32, [4, 256, 16]> var_273_1 = split(axis = var_273_axis_0, split_sizes = var_273_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_273")];
-            tensor<fp32, [4, 256, 16]> var_275 = sigmoid(x = var_273_1)[name = tensor<string, []>("op_275")];
-            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_273_0, y = var_275)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [4, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_329_split_sizes_0 = const()[name = tensor<string, []>("op_329_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_329_axis_0 = const()[name = tensor<string, []>("op_329_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_329_0, tensor<fp32, [4, 256, 16]> var_329_1 = split(axis = var_329_axis_0, split_sizes = var_329_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_329")];
+            tensor<fp32, [4, 256, 16]> var_331 = sigmoid(x = var_329_1)[name = tensor<string, []>("op_331")];
+            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_329_0, y = var_331)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [4, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_65, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [4, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [4, 1, 256]> var_306 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_306")];
-            tensor<int32, [3]> var_308_perm_0 = const()[name = tensor<string, []>("op_308_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_308 = transpose(perm = var_308_perm_0, x = var_306)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 4, 256]> input_31 = add(x = x_3, y = var_308)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 4, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 4, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_331 = const()[name = tensor<string, []>("op_331"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_332 = mul(x = input_39, y = var_331)[name = tensor<string, []>("op_332")];
-            tensor<fp32, [1, 4, 256]> input_41 = add(x = var_332, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_362_begin_0 = const()[name = tensor<string, []>("op_362_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_362_end_0 = const()[name = tensor<string, []>("op_362_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_362_end_mask_0 = const()[name = tensor<string, []>("op_362_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [4, 1, 256]> var_362 = slice_by_index(begin = var_362_begin_0, end = var_362_end_0, end_mask = var_362_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_362")];
+            tensor<int32, [3]> var_364_perm_0 = const()[name = tensor<string, []>("op_364_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_364 = transpose(perm = var_364_perm_0, x = var_362)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 4, 256]> input_33 = add(x = x_3, y = var_364)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 4, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 4, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_388 = mul(x = input_41, y = var_387)[name = tensor<string, []>("op_388")];
+            tensor<fp32, [1, 4, 256]> input_43 = add(x = var_388, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 4, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 4, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_362 = mul(x = input_51, y = var_361)[name = tensor<string, []>("op_362")];
-            tensor<fp32, [1, 4, 256]> input_53 = add(x = var_362, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_65, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 4, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 4, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_418 = mul(x = input_53, y = var_417)[name = tensor<string, []>("op_418")];
+            tensor<fp32, [1, 4, 256]> input_55 = add(x = var_418, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_65, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -416,173 +438,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 4, 256]> var_376 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 4, 256]> var_432 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_433 = const()[name = tensor<string, []>("op_433"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_434 = reshape(shape = var_433, x = var_432)[name = tensor<string, []>("op_434")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_382 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_384 = mul(x = var_382, y = var_383)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
+            tensor<fp32, [1, 4, 256]> var_438 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_439 = const()[name = tensor<string, []>("op_439"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_440 = mul(x = var_438, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441 = const()[name = tensor<string, []>("op_441"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_442 = reshape(shape = var_441, x = var_440)[name = tensor<string, []>("op_442")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_390 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_391 = const()[name = tensor<string, []>("op_391"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_392 = reshape(shape = var_391, x = var_390)[name = tensor<string, []>("op_392")];
+            tensor<fp32, [1, 4, 256]> var_446 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 4, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [4]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_386)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_442)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_434)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 4, 4]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_402 = const()[name = tensor<string, []>("op_402"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_403 = reshape(shape = var_402, x = sqrt_s_t_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [4, 4]> M_3 = real_div(x = encoder__causal_mask, y = var_403)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 4, 4]> var_405 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_405")];
+            tensor<int32, [2]> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_459 = reshape(shape = var_458, x = sqrt_s_t_3)[name = tensor<string, []>("op_459")];
+            tensor<fp32, [4, 4]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_459)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 4, 4]> var_461 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_461")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_392)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_405, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_407_transpose_x_0 = const()[name = tensor<string, []>("op_407_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_407_transpose_y_0 = const()[name = tensor<string, []>("op_407_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_0, transpose_y = var_407_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [4]> var_408 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_408")];
-            tensor<int32, [4]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_410 = reshape(shape = var_409, x = var_408)[name = tensor<string, []>("op_410")];
-            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_407, y = var_410)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_448)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_461, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_463_transpose_x_0 = const()[name = tensor<string, []>("op_463_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_463_transpose_y_0 = const()[name = tensor<string, []>("op_463_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_463 = matmul(transpose_x = var_463_transpose_x_0, transpose_y = var_463_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [4]> var_464 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_464")];
+            tensor<int32, [4]> var_465 = const()[name = tensor<string, []>("op_465"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_466 = reshape(shape = var_465, x = var_464)[name = tensor<string, []>("op_466")];
+            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_463, y = var_466)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 4, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_413 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_413")];
-            tensor<bool, []> var_415_transpose_x_1 = const()[name = tensor<string, []>("op_415_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_415_transpose_y_1 = const()[name = tensor<string, []>("op_415_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_415 = matmul(transpose_x = var_415_transpose_x_1, transpose_y = var_415_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_415")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_413, y = var_415)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_417)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_419 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 64, 64]> var_420 = real_div(x = new_kv_unnorm_3, y = var_419)[name = tensor<string, []>("op_420")];
-            tensor<int32, [4]> var_421_perm_0 = const()[name = tensor<string, []>("op_421_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_469 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_469")];
+            tensor<bool, []> var_471_transpose_x_1 = const()[name = tensor<string, []>("op_471_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_471_transpose_y_1 = const()[name = tensor<string, []>("op_471_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_471 = matmul(transpose_x = var_471_transpose_x_1, transpose_y = var_471_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_469, y = var_471)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_473 = const()[name = tensor<string, []>("op_473"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_473)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_475 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_475")];
+            tensor<fp32, [1, 4, 64, 64]> var_476 = real_div(x = new_kv_unnorm_3, y = var_475)[name = tensor<string, []>("op_476")];
+            tensor<int32, [4]> var_477_perm_0 = const()[name = tensor<string, []>("op_477_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_421 = transpose(perm = var_421_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_421)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_425 = const()[name = tensor<string, []>("op_425"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_425, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 4, 256]> var_427 = silu(x = input_57)[name = tensor<string, []>("op_427")];
-            tensor<fp32, [1, 4, 256]> input_59 = mul(x = var_427, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 4, 4, 64]> var_477 = transpose(perm = var_477_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_73, x = var_477)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_481 = const()[name = tensor<string, []>("op_481"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_481, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 4, 256]> var_483 = silu(x = input_59)[name = tensor<string, []>("op_483")];
+            tensor<fp32, [1, 4, 256]> input_61 = mul(x = var_483, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_11_begin_0 = const()[name = tensor<string, []>("window_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_11_end_0 = const()[name = tensor<string, []>("window_11_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_11_end_mask_0 = const()[name = tensor<string, []>("window_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_11_squeeze_mask_0 = const()[name = tensor<string, []>("window_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_11 = slice_by_index(begin = window_11_begin_0, end = window_11_end_0, end_mask = window_11_end_mask_0, squeeze_mask = window_11_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<int32, [3]> var_491_begin_0 = const()[name = tensor<string, []>("op_491_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_491_end_0 = const()[name = tensor<string, []>("op_491_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_491_end_mask_0 = const()[name = tensor<string, []>("op_491_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_491 = slice_by_index(begin = var_491_begin_0, end = var_491_end_0, end_mask = var_491_end_mask_0, x = x_9)[name = tensor<string, []>("op_491")];
+            tensor<int32, [3]> var_494_begin_0 = const()[name = tensor<string, []>("op_494_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_494_end_0 = const()[name = tensor<string, []>("op_494_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_494_end_mask_0 = const()[name = tensor<string, []>("op_494_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_494 = slice_by_index(begin = var_494_begin_0, end = var_494_end_0, end_mask = var_494_end_mask_0, x = window_11)[name = tensor<string, []>("op_494")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_82, interleave = window_13_interleave_0, values = (var_494, var_491))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_499_begin_0 = const()[name = tensor<string, []>("op_499_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_499_end_0 = const()[name = tensor<string, []>("op_499_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_499_end_mask_0 = const()[name = tensor<string, []>("op_499_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_499 = slice_by_index(begin = var_499_begin_0, end = var_499_end_0, end_mask = var_499_end_mask_0, x = x_9)[name = tensor<string, []>("op_499")];
+            tensor<int32, [3]> var_502_begin_0 = const()[name = tensor<string, []>("op_502_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_502_end_0 = const()[name = tensor<string, []>("op_502_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_502_end_mask_0 = const()[name = tensor<string, []>("op_502_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_502 = slice_by_index(begin = var_502_begin_0, end = var_502_end_0, end_mask = var_502_end_mask_0, x = window_13)[name = tensor<string, []>("op_502")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_451_begin_0 = const()[name = tensor<string, []>("op_451_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_451_end_0 = const()[name = tensor<string, []>("op_451_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_451_end_mask_0 = const()[name = tensor<string, []>("op_451_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_451 = slice_by_index(begin = var_451_begin_0, end = var_451_end_0, end_mask = var_451_end_mask_0, x = x_9)[name = tensor<string, []>("op_451")];
-            tensor<int32, [3]> var_454_begin_0 = const()[name = tensor<string, []>("op_454_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_454_end_0 = const()[name = tensor<string, []>("op_454_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_454_end_mask_0 = const()[name = tensor<string, []>("op_454_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_454 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = window_15)[name = tensor<string, []>("op_454")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_82, interleave = window_15_interleave_0, values = (var_502, var_499))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_507_begin_0 = const()[name = tensor<string, []>("op_507_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_507_end_0 = const()[name = tensor<string, []>("op_507_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_507_end_mask_0 = const()[name = tensor<string, []>("op_507_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_507 = slice_by_index(begin = var_507_begin_0, end = var_507_end_0, end_mask = var_507_end_mask_0, x = x_9)[name = tensor<string, []>("op_507")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = window_15)[name = tensor<string, []>("op_510")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_454, var_451))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_459_begin_0 = const()[name = tensor<string, []>("op_459_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_459_end_0 = const()[name = tensor<string, []>("op_459_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_459_end_mask_0 = const()[name = tensor<string, []>("op_459_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_459 = slice_by_index(begin = var_459_begin_0, end = var_459_end_0, end_mask = var_459_end_mask_0, x = x_9)[name = tensor<string, []>("op_459")];
-            tensor<int32, [3]> var_462_begin_0 = const()[name = tensor<string, []>("op_462_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_462_end_0 = const()[name = tensor<string, []>("op_462_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_462_end_mask_0 = const()[name = tensor<string, []>("op_462_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_462 = slice_by_index(begin = var_462_begin_0, end = var_462_end_0, end_mask = var_462_end_mask_0, x = window_17)[name = tensor<string, []>("op_462")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_82, interleave = window_17_interleave_0, values = (var_510, var_507))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_515_begin_0 = const()[name = tensor<string, []>("op_515_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_515_end_0 = const()[name = tensor<string, []>("op_515_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_515_end_mask_0 = const()[name = tensor<string, []>("op_515_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_515 = slice_by_index(begin = var_515_begin_0, end = var_515_end_0, end_mask = var_515_end_mask_0, x = x_9)[name = tensor<string, []>("op_515")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = window_17)[name = tensor<string, []>("op_518")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_462, var_459))[name = tensor<string, []>("window_19")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_82, interleave = window_19_interleave_0, values = (var_518, var_515))[name = tensor<string, []>("window_19")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_63 = concat(axis = var_68, interleave = input_63_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_487_split_sizes_0 = const()[name = tensor<string, []>("op_487_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_487_axis_0 = const()[name = tensor<string, []>("op_487_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_487_0, tensor<fp32, [4, 256, 16]> var_487_1 = split(axis = var_487_axis_0, split_sizes = var_487_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_487")];
-            tensor<fp32, [4, 256, 16]> var_489 = sigmoid(x = var_487_1)[name = tensor<string, []>("op_489")];
-            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_487_0, y = var_489)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [4, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_543_split_sizes_0 = const()[name = tensor<string, []>("op_543_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_543_axis_0 = const()[name = tensor<string, []>("op_543_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_543_0, tensor<fp32, [4, 256, 16]> var_543_1 = split(axis = var_543_axis_0, split_sizes = var_543_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_543")];
+            tensor<fp32, [4, 256, 16]> var_545 = sigmoid(x = var_543_1)[name = tensor<string, []>("op_545")];
+            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_543_0, y = var_545)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [4, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_65, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [4, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_520_begin_0 = const()[name = tensor<string, []>("op_520_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_520_end_0 = const()[name = tensor<string, []>("op_520_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_520_end_mask_0 = const()[name = tensor<string, []>("op_520_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [4, 1, 256]> var_520 = slice_by_index(begin = var_520_begin_0, end = var_520_end_0, end_mask = var_520_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_520")];
-            tensor<int32, [3]> var_522_perm_0 = const()[name = tensor<string, []>("op_522_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_522 = transpose(perm = var_522_perm_0, x = var_520)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 4, 256]> input_71 = add(x = x_9, y = var_522)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 4, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 4, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_545 = const()[name = tensor<string, []>("op_545"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_546 = mul(x = input_79, y = var_545)[name = tensor<string, []>("op_546")];
-            tensor<fp32, [1, 4, 256]> input_81 = add(x = var_546, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_576_begin_0 = const()[name = tensor<string, []>("op_576_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_576_end_0 = const()[name = tensor<string, []>("op_576_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_576_end_mask_0 = const()[name = tensor<string, []>("op_576_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [4, 1, 256]> var_576 = slice_by_index(begin = var_576_begin_0, end = var_576_end_0, end_mask = var_576_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_576")];
+            tensor<int32, [3]> var_578_perm_0 = const()[name = tensor<string, []>("op_578_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_578 = transpose(perm = var_578_perm_0, x = var_576)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 4, 256]> input_73 = add(x = x_9, y = var_578)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 4, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 4, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_602 = mul(x = input_81, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 256]> input_83 = add(x = var_602, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 4, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 4, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_576 = mul(x = input_91, y = var_575)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 256]> input_93 = add(x = var_576, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_65, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 4, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 4, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_632 = mul(x = input_93, y = var_631)[name = tensor<string, []>("op_632")];
+            tensor<fp32, [1, 4, 256]> input_95 = add(x = var_632, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_65, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -593,173 +615,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 4, 256]> var_590 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
+            tensor<fp32, [1, 4, 256]> var_646 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_647 = const()[name = tensor<string, []>("op_647"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_648 = reshape(shape = var_647, x = var_646)[name = tensor<string, []>("op_648")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_596 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_597 = const()[name = tensor<string, []>("op_597"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_598 = mul(x = var_596, y = var_597)[name = tensor<string, []>("op_598")];
-            tensor<int32, [4]> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_600 = reshape(shape = var_599, x = var_598)[name = tensor<string, []>("op_600")];
+            tensor<fp32, [1, 4, 256]> var_652 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_654 = mul(x = var_652, y = var_653)[name = tensor<string, []>("op_654")];
+            tensor<int32, [4]> var_655 = const()[name = tensor<string, []>("op_655"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_656 = reshape(shape = var_655, x = var_654)[name = tensor<string, []>("op_656")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_604 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_606 = reshape(shape = var_605, x = var_604)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 4, 256]> var_660 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_662 = reshape(shape = var_661, x = var_660)[name = tensor<string, []>("op_662")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 4, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [4]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_600)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_592)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_656)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_648)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 4, 4]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_617 = reshape(shape = var_616, x = sqrt_s_t_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [4, 4]> M_5 = real_div(x = encoder__causal_mask, y = var_617)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 4, 4]> var_619 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_619")];
+            tensor<int32, [2]> var_672 = const()[name = tensor<string, []>("op_672"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_673 = reshape(shape = var_672, x = sqrt_s_t_5)[name = tensor<string, []>("op_673")];
+            tensor<fp32, [4, 4]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_673)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 4, 4]> var_675 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_675")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_606)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_619, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_621_transpose_x_0 = const()[name = tensor<string, []>("op_621_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_621_transpose_y_0 = const()[name = tensor<string, []>("op_621_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_621 = matmul(transpose_x = var_621_transpose_x_0, transpose_y = var_621_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_621")];
-            tensor<fp32, [4]> var_622 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_622")];
-            tensor<int32, [4]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_624 = reshape(shape = var_623, x = var_622)[name = tensor<string, []>("op_624")];
-            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_621, y = var_624)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_662)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_675, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_677_transpose_x_0 = const()[name = tensor<string, []>("op_677_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_677_transpose_y_0 = const()[name = tensor<string, []>("op_677_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_677 = matmul(transpose_x = var_677_transpose_x_0, transpose_y = var_677_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [4]> var_678 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_678")];
+            tensor<int32, [4]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_680 = reshape(shape = var_679, x = var_678)[name = tensor<string, []>("op_680")];
+            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_677, y = var_680)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 4, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_627 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_627")];
-            tensor<bool, []> var_629_transpose_x_1 = const()[name = tensor<string, []>("op_629_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_629_transpose_y_1 = const()[name = tensor<string, []>("op_629_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_629 = matmul(transpose_x = var_629_transpose_x_1, transpose_y = var_629_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_627, y = var_629)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_631)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_633 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_633")];
-            tensor<fp32, [1, 4, 64, 64]> var_634 = real_div(x = new_kv_unnorm_5, y = var_633)[name = tensor<string, []>("op_634")];
-            tensor<int32, [4]> var_635_perm_0 = const()[name = tensor<string, []>("op_635_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_683 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_683")];
+            tensor<bool, []> var_685_transpose_x_1 = const()[name = tensor<string, []>("op_685_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_685_transpose_y_1 = const()[name = tensor<string, []>("op_685_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_685 = matmul(transpose_x = var_685_transpose_x_1, transpose_y = var_685_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_683, y = var_685)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_687)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_689 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [1, 4, 64, 64]> var_690 = real_div(x = new_kv_unnorm_5, y = var_689)[name = tensor<string, []>("op_690")];
+            tensor<int32, [4]> var_691_perm_0 = const()[name = tensor<string, []>("op_691_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_635 = transpose(perm = var_635_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_635)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_639 = const()[name = tensor<string, []>("op_639"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_639, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 4, 256]> var_641 = silu(x = input_97)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 256]> input_99 = mul(x = var_641, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 4, 4, 64]> var_691 = transpose(perm = var_691_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_73, x = var_691)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_695 = const()[name = tensor<string, []>("op_695"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_695, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 4, 256]> var_697 = silu(x = input_99)[name = tensor<string, []>("op_697")];
+            tensor<fp32, [1, 4, 256]> input_101 = mul(x = var_697, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_21_begin_0 = const()[name = tensor<string, []>("window_21_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_21_end_0 = const()[name = tensor<string, []>("window_21_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_21_end_mask_0 = const()[name = tensor<string, []>("window_21_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_21_squeeze_mask_0 = const()[name = tensor<string, []>("window_21_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_21 = slice_by_index(begin = window_21_begin_0, end = window_21_end_0, end_mask = window_21_end_mask_0, squeeze_mask = window_21_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<int32, [3]> var_705_begin_0 = const()[name = tensor<string, []>("op_705_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_705_end_0 = const()[name = tensor<string, []>("op_705_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_705_end_mask_0 = const()[name = tensor<string, []>("op_705_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_705 = slice_by_index(begin = var_705_begin_0, end = var_705_end_0, end_mask = var_705_end_mask_0, x = x_15)[name = tensor<string, []>("op_705")];
+            tensor<int32, [3]> var_708_begin_0 = const()[name = tensor<string, []>("op_708_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_708_end_0 = const()[name = tensor<string, []>("op_708_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_708_end_mask_0 = const()[name = tensor<string, []>("op_708_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_708 = slice_by_index(begin = var_708_begin_0, end = var_708_end_0, end_mask = var_708_end_mask_0, x = window_21)[name = tensor<string, []>("op_708")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<int32, [3]> var_657_begin_0 = const()[name = tensor<string, []>("op_657_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_657_end_0 = const()[name = tensor<string, []>("op_657_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_657_end_mask_0 = const()[name = tensor<string, []>("op_657_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_657 = slice_by_index(begin = var_657_begin_0, end = var_657_end_0, end_mask = var_657_end_mask_0, x = x_15)[name = tensor<string, []>("op_657")];
-            tensor<int32, [3]> var_660_begin_0 = const()[name = tensor<string, []>("op_660_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_660_end_0 = const()[name = tensor<string, []>("op_660_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_660_end_mask_0 = const()[name = tensor<string, []>("op_660_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_660 = slice_by_index(begin = var_660_begin_0, end = var_660_end_0, end_mask = var_660_end_mask_0, x = window_23)[name = tensor<string, []>("op_660")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_82, interleave = window_23_interleave_0, values = (var_708, var_705))[name = tensor<string, []>("window_23")];
+            tensor<int32, [3]> var_713_begin_0 = const()[name = tensor<string, []>("op_713_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_713_end_0 = const()[name = tensor<string, []>("op_713_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_713_end_mask_0 = const()[name = tensor<string, []>("op_713_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_713 = slice_by_index(begin = var_713_begin_0, end = var_713_end_0, end_mask = var_713_end_mask_0, x = x_15)[name = tensor<string, []>("op_713")];
+            tensor<int32, [3]> var_716_begin_0 = const()[name = tensor<string, []>("op_716_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_716_end_0 = const()[name = tensor<string, []>("op_716_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_716_end_mask_0 = const()[name = tensor<string, []>("op_716_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_716 = slice_by_index(begin = var_716_begin_0, end = var_716_end_0, end_mask = var_716_end_mask_0, x = window_23)[name = tensor<string, []>("op_716")];
             tensor<bool, []> window_25_interleave_0 = const()[name = tensor<string, []>("window_25_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_26, interleave = window_25_interleave_0, values = (var_660, var_657))[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_665_begin_0 = const()[name = tensor<string, []>("op_665_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_665_end_0 = const()[name = tensor<string, []>("op_665_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_665_end_mask_0 = const()[name = tensor<string, []>("op_665_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_665 = slice_by_index(begin = var_665_begin_0, end = var_665_end_0, end_mask = var_665_end_mask_0, x = x_15)[name = tensor<string, []>("op_665")];
-            tensor<int32, [3]> var_668_begin_0 = const()[name = tensor<string, []>("op_668_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_668_end_0 = const()[name = tensor<string, []>("op_668_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_668_end_mask_0 = const()[name = tensor<string, []>("op_668_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_668 = slice_by_index(begin = var_668_begin_0, end = var_668_end_0, end_mask = var_668_end_mask_0, x = window_25)[name = tensor<string, []>("op_668")];
+            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_82, interleave = window_25_interleave_0, values = (var_716, var_713))[name = tensor<string, []>("window_25")];
+            tensor<int32, [3]> var_721_begin_0 = const()[name = tensor<string, []>("op_721_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_721_end_0 = const()[name = tensor<string, []>("op_721_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_721_end_mask_0 = const()[name = tensor<string, []>("op_721_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_721 = slice_by_index(begin = var_721_begin_0, end = var_721_end_0, end_mask = var_721_end_mask_0, x = x_15)[name = tensor<string, []>("op_721")];
+            tensor<int32, [3]> var_724_begin_0 = const()[name = tensor<string, []>("op_724_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_724_end_0 = const()[name = tensor<string, []>("op_724_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_724_end_mask_0 = const()[name = tensor<string, []>("op_724_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_724 = slice_by_index(begin = var_724_begin_0, end = var_724_end_0, end_mask = var_724_end_mask_0, x = window_25)[name = tensor<string, []>("op_724")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_668, var_665))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_673_begin_0 = const()[name = tensor<string, []>("op_673_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_673_end_0 = const()[name = tensor<string, []>("op_673_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_673_end_mask_0 = const()[name = tensor<string, []>("op_673_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_673 = slice_by_index(begin = var_673_begin_0, end = var_673_end_0, end_mask = var_673_end_mask_0, x = x_15)[name = tensor<string, []>("op_673")];
-            tensor<int32, [3]> var_676_begin_0 = const()[name = tensor<string, []>("op_676_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_676_end_0 = const()[name = tensor<string, []>("op_676_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_676_end_mask_0 = const()[name = tensor<string, []>("op_676_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_676 = slice_by_index(begin = var_676_begin_0, end = var_676_end_0, end_mask = var_676_end_mask_0, x = window_27)[name = tensor<string, []>("op_676")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_82, interleave = window_27_interleave_0, values = (var_724, var_721))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_729_begin_0 = const()[name = tensor<string, []>("op_729_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_729_end_0 = const()[name = tensor<string, []>("op_729_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_729_end_mask_0 = const()[name = tensor<string, []>("op_729_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_729 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = x_15)[name = tensor<string, []>("op_729")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = window_27)[name = tensor<string, []>("op_732")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_676, var_673))[name = tensor<string, []>("window_29")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_82, interleave = window_29_interleave_0, values = (var_732, var_729))[name = tensor<string, []>("window_29")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_103 = concat(axis = var_68, interleave = input_103_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_701_split_sizes_0 = const()[name = tensor<string, []>("op_701_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_701_axis_0 = const()[name = tensor<string, []>("op_701_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_701_0, tensor<fp32, [4, 256, 16]> var_701_1 = split(axis = var_701_axis_0, split_sizes = var_701_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_701")];
-            tensor<fp32, [4, 256, 16]> var_703 = sigmoid(x = var_701_1)[name = tensor<string, []>("op_703")];
-            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_701_0, y = var_703)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [4, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_757_split_sizes_0 = const()[name = tensor<string, []>("op_757_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_757_axis_0 = const()[name = tensor<string, []>("op_757_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_757_0, tensor<fp32, [4, 256, 16]> var_757_1 = split(axis = var_757_axis_0, split_sizes = var_757_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_757")];
+            tensor<fp32, [4, 256, 16]> var_759 = sigmoid(x = var_757_1)[name = tensor<string, []>("op_759")];
+            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_757_0, y = var_759)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [4, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_65, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [4, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_734_begin_0 = const()[name = tensor<string, []>("op_734_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_734_end_0 = const()[name = tensor<string, []>("op_734_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_734_end_mask_0 = const()[name = tensor<string, []>("op_734_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [4, 1, 256]> var_734 = slice_by_index(begin = var_734_begin_0, end = var_734_end_0, end_mask = var_734_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_734")];
-            tensor<int32, [3]> var_736_perm_0 = const()[name = tensor<string, []>("op_736_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_736 = transpose(perm = var_736_perm_0, x = var_734)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 4, 256]> input_111 = add(x = x_15, y = var_736)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 4, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 4, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_760 = mul(x = input_119, y = var_759)[name = tensor<string, []>("op_760")];
-            tensor<fp32, [1, 4, 256]> input_121 = add(x = var_760, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_790_begin_0 = const()[name = tensor<string, []>("op_790_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_790_end_0 = const()[name = tensor<string, []>("op_790_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_790_end_mask_0 = const()[name = tensor<string, []>("op_790_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [4, 1, 256]> var_790 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_790")];
+            tensor<int32, [3]> var_792_perm_0 = const()[name = tensor<string, []>("op_792_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_792 = transpose(perm = var_792_perm_0, x = var_790)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 4, 256]> input_113 = add(x = x_15, y = var_792)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 4, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 4, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_815 = const()[name = tensor<string, []>("op_815"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_816 = mul(x = input_121, y = var_815)[name = tensor<string, []>("op_816")];
+            tensor<fp32, [1, 4, 256]> input_123 = add(x = var_816, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 4, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 4, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_790 = mul(x = input_131, y = var_789)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 256]> input_133 = add(x = var_790, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_65, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 4, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 4, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_846 = mul(x = input_133, y = var_845)[name = tensor<string, []>("op_846")];
+            tensor<fp32, [1, 4, 256]> input_135 = add(x = var_846, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_65, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -770,209 +792,202 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 4, 256]> var_804 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_806 = reshape(shape = var_805, x = var_804)[name = tensor<string, []>("op_806")];
+            tensor<fp32, [1, 4, 256]> var_860 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_861 = const()[name = tensor<string, []>("op_861"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_862 = reshape(shape = var_861, x = var_860)[name = tensor<string, []>("op_862")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_810 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_812 = mul(x = var_810, y = var_811)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
+            tensor<fp32, [1, 4, 256]> var_866 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_868 = mul(x = var_866, y = var_867)[name = tensor<string, []>("op_868")];
+            tensor<int32, [4]> var_869 = const()[name = tensor<string, []>("op_869"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_870 = reshape(shape = var_869, x = var_868)[name = tensor<string, []>("op_870")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_818 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_819 = const()[name = tensor<string, []>("op_819"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_820 = reshape(shape = var_819, x = var_818)[name = tensor<string, []>("op_820")];
+            tensor<fp32, [1, 4, 256]> var_874 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_876 = reshape(shape = var_875, x = var_874)[name = tensor<string, []>("op_876")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 4, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [4]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_814)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_806)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_870)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_862)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 4, 4]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_831 = reshape(shape = var_830, x = sqrt_s_t_7)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [4, 4]> M_7 = real_div(x = encoder__causal_mask, y = var_831)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 4, 4]> var_833 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_833")];
+            tensor<int32, [2]> var_886 = const()[name = tensor<string, []>("op_886"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_887 = reshape(shape = var_886, x = sqrt_s_t_7)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [4, 4]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_887)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 4, 4]> var_889 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_889")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_820)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_833, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_835_transpose_x_0 = const()[name = tensor<string, []>("op_835_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_835_transpose_y_0 = const()[name = tensor<string, []>("op_835_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_835 = matmul(transpose_x = var_835_transpose_x_0, transpose_y = var_835_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_835")];
-            tensor<fp32, [4]> var_836 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_836")];
-            tensor<int32, [4]> var_837 = const()[name = tensor<string, []>("op_837"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_838 = reshape(shape = var_837, x = var_836)[name = tensor<string, []>("op_838")];
-            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_835, y = var_838)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_876)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_889, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_891_transpose_x_0 = const()[name = tensor<string, []>("op_891_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_891_transpose_y_0 = const()[name = tensor<string, []>("op_891_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_891 = matmul(transpose_x = var_891_transpose_x_0, transpose_y = var_891_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_891")];
+            tensor<fp32, [4]> var_892 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_892")];
+            tensor<int32, [4]> var_893 = const()[name = tensor<string, []>("op_893"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_894 = reshape(shape = var_893, x = var_892)[name = tensor<string, []>("op_894")];
+            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_891, y = var_894)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 4, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_841 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_841")];
-            tensor<bool, []> var_843_transpose_x_1 = const()[name = tensor<string, []>("op_843_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_843_transpose_y_1 = const()[name = tensor<string, []>("op_843_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_843 = matmul(transpose_x = var_843_transpose_x_1, transpose_y = var_843_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_843")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_841, y = var_843)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_845)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_847 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_847")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_847)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_849_perm_0 = const()[name = tensor<string, []>("op_849_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_897 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_897")];
+            tensor<bool, []> var_899_transpose_x_1 = const()[name = tensor<string, []>("op_899_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_899_transpose_y_1 = const()[name = tensor<string, []>("op_899_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_899 = matmul(transpose_x = var_899_transpose_x_1, transpose_y = var_899_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_897, y = var_899)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_901 = const()[name = tensor<string, []>("op_901"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_901)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_903 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_903")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_903)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_905_perm_0 = const()[name = tensor<string, []>("op_905_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_849 = transpose(perm = var_849_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_849)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_853 = const()[name = tensor<string, []>("op_853"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_853, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 4, 256]> var_855 = silu(x = input_137)[name = tensor<string, []>("op_855")];
-            tensor<fp32, [1, 4, 256]> input_139 = mul(x = var_855, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 4, 4, 64]> var_905 = transpose(perm = var_905_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_73, x = var_905)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_909, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 4, 256]> var_911 = silu(x = input_139)[name = tensor<string, []>("op_911")];
+            tensor<fp32, [1, 4, 256]> input_141 = mul(x = var_911, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_31_begin_0 = const()[name = tensor<string, []>("window_31_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_31_end_0 = const()[name = tensor<string, []>("window_31_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_31_end_mask_0 = const()[name = tensor<string, []>("window_31_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_31_squeeze_mask_0 = const()[name = tensor<string, []>("window_31_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_31 = slice_by_index(begin = window_31_begin_0, end = window_31_end_0, end_mask = window_31_end_mask_0, squeeze_mask = window_31_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_863_begin_0 = const()[name = tensor<string, []>("op_863_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_863_end_0 = const()[name = tensor<string, []>("op_863_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_863_end_mask_0 = const()[name = tensor<string, []>("op_863_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_863 = slice_by_index(begin = var_863_begin_0, end = var_863_end_0, end_mask = var_863_end_mask_0, x = x_21)[name = tensor<string, []>("op_863")];
-            tensor<int32, [3]> var_866_begin_0 = const()[name = tensor<string, []>("op_866_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_866_end_0 = const()[name = tensor<string, []>("op_866_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_866_end_mask_0 = const()[name = tensor<string, []>("op_866_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_866 = slice_by_index(begin = var_866_begin_0, end = var_866_end_0, end_mask = var_866_end_mask_0, x = window_31)[name = tensor<string, []>("op_866")];
+            tensor<int32, [3]> var_919_begin_0 = const()[name = tensor<string, []>("op_919_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_919_end_0 = const()[name = tensor<string, []>("op_919_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_919_end_mask_0 = const()[name = tensor<string, []>("op_919_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_919 = slice_by_index(begin = var_919_begin_0, end = var_919_end_0, end_mask = var_919_end_mask_0, x = x_21)[name = tensor<string, []>("op_919")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_922 = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = window_31)[name = tensor<string, []>("op_922")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_26, interleave = window_33_interleave_0, values = (var_866, var_863))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_871 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = x_21)[name = tensor<string, []>("op_871")];
-            tensor<int32, [3]> var_874_begin_0 = const()[name = tensor<string, []>("op_874_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_874_end_0 = const()[name = tensor<string, []>("op_874_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_874_end_mask_0 = const()[name = tensor<string, []>("op_874_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_874 = slice_by_index(begin = var_874_begin_0, end = var_874_end_0, end_mask = var_874_end_mask_0, x = window_33)[name = tensor<string, []>("op_874")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_82, interleave = window_33_interleave_0, values = (var_922, var_919))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_927_begin_0 = const()[name = tensor<string, []>("op_927_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_927_end_0 = const()[name = tensor<string, []>("op_927_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_927_end_mask_0 = const()[name = tensor<string, []>("op_927_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_927 = slice_by_index(begin = var_927_begin_0, end = var_927_end_0, end_mask = var_927_end_mask_0, x = x_21)[name = tensor<string, []>("op_927")];
+            tensor<int32, [3]> var_930_begin_0 = const()[name = tensor<string, []>("op_930_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_930_end_0 = const()[name = tensor<string, []>("op_930_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_930_end_mask_0 = const()[name = tensor<string, []>("op_930_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_930 = slice_by_index(begin = var_930_begin_0, end = var_930_end_0, end_mask = var_930_end_mask_0, x = window_33)[name = tensor<string, []>("op_930")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_26, interleave = window_35_interleave_0, values = (var_874, var_871))[name = tensor<string, []>("window_35")];
-            tensor<int32, [3]> var_879_begin_0 = const()[name = tensor<string, []>("op_879_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_879_end_0 = const()[name = tensor<string, []>("op_879_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_879_end_mask_0 = const()[name = tensor<string, []>("op_879_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_879 = slice_by_index(begin = var_879_begin_0, end = var_879_end_0, end_mask = var_879_end_mask_0, x = x_21)[name = tensor<string, []>("op_879")];
-            tensor<int32, [3]> var_882_begin_0 = const()[name = tensor<string, []>("op_882_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_882_end_0 = const()[name = tensor<string, []>("op_882_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_882_end_mask_0 = const()[name = tensor<string, []>("op_882_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_882 = slice_by_index(begin = var_882_begin_0, end = var_882_end_0, end_mask = var_882_end_mask_0, x = window_35)[name = tensor<string, []>("op_882")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_82, interleave = window_35_interleave_0, values = (var_930, var_927))[name = tensor<string, []>("window_35")];
+            tensor<int32, [3]> var_935_begin_0 = const()[name = tensor<string, []>("op_935_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_935_end_0 = const()[name = tensor<string, []>("op_935_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_935_end_mask_0 = const()[name = tensor<string, []>("op_935_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_935 = slice_by_index(begin = var_935_begin_0, end = var_935_end_0, end_mask = var_935_end_mask_0, x = x_21)[name = tensor<string, []>("op_935")];
+            tensor<int32, [3]> var_938_begin_0 = const()[name = tensor<string, []>("op_938_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_938_end_0 = const()[name = tensor<string, []>("op_938_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_938_end_mask_0 = const()[name = tensor<string, []>("op_938_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_938 = slice_by_index(begin = var_938_begin_0, end = var_938_end_0, end_mask = var_938_end_mask_0, x = window_35)[name = tensor<string, []>("op_938")];
             tensor<bool, []> window_37_interleave_0 = const()[name = tensor<string, []>("window_37_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_26, interleave = window_37_interleave_0, values = (var_882, var_879))[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_887_begin_0 = const()[name = tensor<string, []>("op_887_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_887_end_0 = const()[name = tensor<string, []>("op_887_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_887_end_mask_0 = const()[name = tensor<string, []>("op_887_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_887 = slice_by_index(begin = var_887_begin_0, end = var_887_end_0, end_mask = var_887_end_mask_0, x = x_21)[name = tensor<string, []>("op_887")];
-            tensor<int32, [3]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_890 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = window_37)[name = tensor<string, []>("op_890")];
+            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_82, interleave = window_37_interleave_0, values = (var_938, var_935))[name = tensor<string, []>("window_37")];
+            tensor<int32, [3]> var_943_begin_0 = const()[name = tensor<string, []>("op_943_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_943_end_0 = const()[name = tensor<string, []>("op_943_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_943_end_mask_0 = const()[name = tensor<string, []>("op_943_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_943 = slice_by_index(begin = var_943_begin_0, end = var_943_end_0, end_mask = var_943_end_mask_0, x = x_21)[name = tensor<string, []>("op_943")];
+            tensor<int32, [3]> var_946_begin_0 = const()[name = tensor<string, []>("op_946_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_946_end_0 = const()[name = tensor<string, []>("op_946_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_946_end_mask_0 = const()[name = tensor<string, []>("op_946_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_946 = slice_by_index(begin = var_946_begin_0, end = var_946_end_0, end_mask = var_946_end_mask_0, x = window_37)[name = tensor<string, []>("op_946")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_890, var_887))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_82, interleave = window_interleave_0, values = (var_946, var_943))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_143 = concat(axis = var_68, interleave = input_143_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_915_split_sizes_0 = const()[name = tensor<string, []>("op_915_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_915_axis_0 = const()[name = tensor<string, []>("op_915_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_915_0, tensor<fp32, [4, 256, 16]> var_915_1 = split(axis = var_915_axis_0, split_sizes = var_915_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_915")];
-            tensor<fp32, [4, 256, 16]> var_917 = sigmoid(x = var_915_1)[name = tensor<string, []>("op_917")];
-            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_915_0, y = var_917)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [4, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_971_split_sizes_0 = const()[name = tensor<string, []>("op_971_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_971_0, tensor<fp32, [4, 256, 16]> var_971_1 = split(axis = var_971_axis_0, split_sizes = var_971_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_971")];
+            tensor<fp32, [4, 256, 16]> var_973 = sigmoid(x = var_971_1)[name = tensor<string, []>("op_973")];
+            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_971_0, y = var_973)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [4, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_65, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [4, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [4, 1, 256]> var_948 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_948")];
-            tensor<int32, [3]> var_950_perm_0 = const()[name = tensor<string, []>("op_950_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_950 = transpose(perm = var_950_perm_0, x = var_948)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 4, 256]> input_151 = add(x = x_21, y = var_950)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 4, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 4, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_973 = const()[name = tensor<string, []>("op_973"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_974 = mul(x = input_159, y = var_973)[name = tensor<string, []>("op_974")];
-            tensor<fp32, [1, 4, 256]> input_161 = add(x = var_974, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [4, 1, 256]> var_1004 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1004")];
+            tensor<int32, [3]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = var_1004)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 4, 256]> input_153 = add(x = x_21, y = var_1006)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_65, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 4, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 4, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1029 = const()[name = tensor<string, []>("op_1029"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_1030 = mul(x = input_161, y = var_1029)[name = tensor<string, []>("op_1030")];
+            tensor<fp32, [1, 4, 256]> input_163 = add(x = var_1030, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_65, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 4]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_70, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
-            tensor<int32, [3]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
-            tensor<bool, [3]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = cat)[name = tensor<string, []>("op_992")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 4, 1]> var_995 = reduce_l2_norm(axes = var_994, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_995")];
+            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1048_begin_0 = const()[name = tensor<string, []>("op_1048_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
+            tensor<int32, [3]> var_1048_end_0 = const()[name = tensor<string, []>("op_1048_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
+            tensor<bool, [3]> var_1048_end_mask_0 = const()[name = tensor<string, []>("op_1048_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1048_begin_0, end = var_1048_end_0, end_mask = var_1048_end_mask_0, x = cat)[name = tensor<string, []>("op_1048")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 4, 1]> var_1051 = reduce_l2_norm(axes = var_1050, keep_dims = var_64, x = input_165)[name = tensor<string, []>("op_1051")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_995)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_999_axis_0 = const()[name = tensor<string, []>("op_999_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_999_axis_0, values = (var_206, var_420, var_634, nkv_1))[name = tensor<string, []>("op_999")];
-            tensor<int32, []> var_1001_axis_0 = const()[name = tensor<string, []>("op_1001_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1001_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1001")];
-            tensor<int32, []> var_1003_axis_0 = const()[name = tensor<string, []>("op_1003_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1003_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1003")];
-            tensor<fp32, []> var_1012 = const()[name = tensor<string, []>("op_1012"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1017 = const()[name = tensor<string, []>("op_1017"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1020 = const()[name = tensor<string, []>("op_1020"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1026 = const()[name = tensor<string, []>("op_1026"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1032 = const()[name = tensor<string, []>("op_1032"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_78, beta = const_12, x = var_1051)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1055_axis_0 = const()[name = tensor<string, []>("op_1055_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1055_axis_0, values = (var_262, var_476, var_690, nkv_1))[name = tensor<string, []>("op_1055")];
+            tensor<int32, []> var_1057_axis_0 = const()[name = tensor<string, []>("op_1057_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1057_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1057")];
+            tensor<int32, []> var_1059_axis_0 = const()[name = tensor<string, []>("op_1059_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1059_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1059")];
             tensor<fp32, [1, 4, 6, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 4, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395584)))];
-            tensor<int32, [1]> var_1094_axes_0 = const()[name = tensor<string, []>("op_1094_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 4, 1, 256]> var_1094 = expand_dims(axes = var_1094_axes_0, x = emb)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 4, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 6, 1])];
-            tensor<fp32, [1, 4, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1094)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 6, 512]> input_165 = concat(axis = var_1026, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 4, 6, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1102_perm_0 = const()[name = tensor<string, []>("op_1102_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [1, 6, 4, 256]> var_1102 = transpose(perm = var_1102_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [6, 4, 256]> x_29 = reshape(shape = var_1106, x = var_1102)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 4, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 6, 512]> input_167 = concat(axis = var_71, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 4, 6, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [1, 6, 4, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [6, 4, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -983,132 +998,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [6, 4, 256]> var_1114 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1115 = const()[name = tensor<string, []>("op_1115"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1116 = reshape(shape = var_1115, x = var_1114)[name = tensor<string, []>("op_1116")];
+            tensor<fp32, [6, 4, 256]> var_1147 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> var_1120 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1121 = const()[name = tensor<string, []>("op_1121"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 4, 256]> var_1122 = mul(x = var_1120, y = var_1121)[name = tensor<string, []>("op_1122")];
-            tensor<int32, [4]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1124 = reshape(shape = var_1123, x = var_1122)[name = tensor<string, []>("op_1124")];
+            tensor<fp32, [6, 4, 256]> var_1153 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 4, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> var_1128 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1129 = const()[name = tensor<string, []>("op_1129"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1130 = reshape(shape = var_1129, x = var_1128)[name = tensor<string, []>("op_1130")];
+            tensor<fp32, [6, 4, 256]> var_1161 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [6, 4, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_1032, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_68, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [4]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_1 = clip(alpha = var_1022, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [4]> clip_1 = clip(alpha = var_58, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [4]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1124)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [6, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1116)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [6, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [6, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [6, 4, 4, 4]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [2]>([1, 4])];
-            tensor<fp32, [1, 4]> var_1143 = reshape(shape = var_1142, x = valid_mask)[name = tensor<string, []>("op_1143")];
-            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1143)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1146 = reshape(shape = var_1145, x = sqrt_s_t_9)[name = tensor<string, []>("op_1146")];
-            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1146)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [6, 4, 4, 4]> var_1148 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1148")];
+            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 4])];
+            tensor<fp32, [1, 4]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
+            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
+            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [6, 4, 4, 4]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1130)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [6, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1148, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1150_transpose_x_0 = const()[name = tensor<string, []>("op_1150_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1150_transpose_y_0 = const()[name = tensor<string, []>("op_1150_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 4, 64]> var_1150 = matmul(transpose_x = var_1150_transpose_x_0, transpose_y = var_1150_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1150")];
-            tensor<fp32, [4]> var_1151 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1151")];
-            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1153 = reshape(shape = var_1152, x = var_1151)[name = tensor<string, []>("op_1153")];
-            tensor<fp32, [6, 4, 4, 64]> cross_9 = mul(x = var_1150, y = var_1153)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [6, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [6, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 4, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
+            tensor<fp32, [4]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
+            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
+            tensor<fp32, [6, 4, 4, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [6, 4, 4, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1157 = reshape(shape = var_1156, x = valid_mask)[name = tensor<string, []>("op_1157")];
-            tensor<fp32, [6, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1157)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1159 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1159")];
-            tensor<bool, []> var_1161_transpose_x_1 = const()[name = tensor<string, []>("op_1161_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1161_transpose_y_1 = const()[name = tensor<string, []>("op_1161_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1161 = matmul(transpose_x = var_1161_transpose_x_1, transpose_y = var_1161_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1161")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1159, y = var_1161)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1163_keep_dims_0 = const()[name = tensor<string, []>("op_1163_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1163 = reduce_sum(keep_dims = var_1163_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1163")];
-            tensor<int32, [1]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1165 = reshape(shape = var_1164, x = var_1163)[name = tensor<string, []>("op_1165")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1165)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
+            tensor<fp32, [6, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [6, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
+            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
+            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1022, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_58, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1169 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1169")];
-            tensor<int32, [4]> var_1170_perm_0 = const()[name = tensor<string, []>("op_1170_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [6, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
+            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 4, 4, 64]> var_1170 = transpose(perm = var_1170_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [6, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1019, x = var_1170)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1174 = const()[name = tensor<string, []>("op_1174"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [6, 4, 256]> out_29 = reshape(shape = var_1174, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [6, 4, 256]> var_1176 = silu(x = input_169)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [6, 4, 256]> input_171 = mul(x = var_1176, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [6, 4, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [6, 4, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 4, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [6, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_73, x = var_1203)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [6, 4, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [6, 4, 256]> var_1209 = silu(x = input_171)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [6, 4, 256]> input_173 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 4, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [6, 4, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1017, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1186 = const()[name = tensor<string, []>("op_1186"), val = tensor<int32, [4]>([1, 6, 4, 256])];
-            tensor<fp32, [1, 6, 4, 256]> var_1187 = reshape(shape = var_1186, x = xt_1)[name = tensor<string, []>("op_1187")];
-            tensor<int32, [4]> var_1188_perm_0 = const()[name = tensor<string, []>("op_1188_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([4, 6, 256])];
-            tensor<fp32, [1, 4, 6, 256]> var_1188 = transpose(perm = var_1188_perm_0, x = var_1187)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [4, 6, 256]> query_1 = reshape(shape = var_1191, x = var_1188)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [6, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_65, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 6, 4, 256])];
+            tensor<fp32, [1, 6, 4, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
+            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([4, 6, 256])];
+            tensor<fp32, [1, 4, 6, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [4, 6, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 4, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [6, 4, 768]> var_1214 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [6, 4, 768]> var_1247 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([6, 4, 3, 256])];
-            tensor<fp32, [6, 4, 3, 256]> var_1216 = reshape(shape = concat_1, x = var_1214)[name = tensor<string, []>("op_1216")];
-            tensor<int32, [1]> var_1217_axes_0 = const()[name = tensor<string, []>("op_1217_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 4, 3, 256]> var_1217 = expand_dims(axes = var_1217_axes_0, x = var_1216)[name = tensor<string, []>("op_1217")];
-            tensor<int32, [5]> var_1218_perm_0 = const()[name = tensor<string, []>("op_1218_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1219_axes_0 = const()[name = tensor<string, []>("op_1219_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 4, 1, 256]> var_1218 = transpose(perm = var_1218_perm_0, x = var_1217)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 6, 4, 256]> var_1219 = squeeze(axes = var_1219_axes_0, x = var_1218)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [6, 4, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
+            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 4, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
+            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 4, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 6, 4, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 6, 4, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [6, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 6, 4, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [6, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 6, 4, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1227 = const()[name = tensor<string, []>("op_1227"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1228 = reshape(shape = var_1227, x = q_11)[name = tensor<string, []>("op_1228")];
+            tensor<fp32, [6, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1234 = const()[name = tensor<string, []>("op_1234"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1235 = reshape(shape = var_1234, x = k_11)[name = tensor<string, []>("op_1235")];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1241 = const()[name = tensor<string, []>("op_1241"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1242 = reshape(shape = var_1241, x = v_11)[name = tensor<string, []>("op_1242")];
+            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1228)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [4, 4, 6, 64]> q_15 = reshape(shape = var_1245, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1247 = const()[name = tensor<string, []>("op_1247"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1235)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [4, 4, 6, 64]> k_15 = reshape(shape = var_1247, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1242)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [4, 4, 6, 64]> v_15 = reshape(shape = var_1249, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [4, 4, 6, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [4, 4, 6, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [4, 4, 6, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 6, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1119,30 +1134,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 6, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1257 = const()[name = tensor<string, []>("op_1257"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [6, 4, 4, 64]> var_1253 = transpose(perm = var_1252, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1257, x = var_1253)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1261 = const()[name = tensor<string, []>("op_1261"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [6, 4, 256]> attn_output_7 = reshape(shape = var_1261, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [6, 4, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [6, 4, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 6, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [4, 6, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 6, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1017, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [4, 6, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [4, 6, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [4, 6, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [4, 6, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 6, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 6, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_65, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [4, 6, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [4, 6, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 6, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [4, 6, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1017, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1281 = const()[name = tensor<string, []>("op_1281"), val = tensor<int32, [4]>([1, 4, 6, 256])];
-            tensor<fp32, [1, 4, 6, 256]> x_31 = reshape(shape = var_1281, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1283_perm_0 = const()[name = tensor<string, []>("op_1283_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [1, 6, 4, 256]> var_1283 = transpose(perm = var_1283_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [6, 4, 256]> x = reshape(shape = var_1287, x = var_1283)[name = tensor<string, []>("x")];
+            tensor<fp32, [4, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_65, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 4, 6, 256])];
+            tensor<fp32, [1, 4, 6, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [1, 6, 4, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [6, 4, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1153,120 +1168,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [6, 4, 256]> var_1295 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1296 = const()[name = tensor<string, []>("op_1296"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1297 = reshape(shape = var_1296, x = var_1295)[name = tensor<string, []>("op_1297")];
+            tensor<fp32, [6, 4, 256]> var_1328 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> var_1301 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1302 = const()[name = tensor<string, []>("op_1302"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 4, 256]> var_1303 = mul(x = var_1301, y = var_1302)[name = tensor<string, []>("op_1303")];
-            tensor<int32, [4]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1305 = reshape(shape = var_1304, x = var_1303)[name = tensor<string, []>("op_1305")];
+            tensor<fp32, [6, 4, 256]> var_1334 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 4, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> var_1309 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [4]>([6, 4, 4, 64])];
-            tensor<fp32, [6, 4, 4, 64]> var_1311 = reshape(shape = var_1310, x = var_1309)[name = tensor<string, []>("op_1311")];
+            tensor<fp32, [6, 4, 256]> var_1342 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([6, 4, 4, 64])];
+            tensor<fp32, [6, 4, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 4, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [6, 4, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [4]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_3 = clip(alpha = var_1022, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [4]> clip_3 = clip(alpha = var_58, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [4]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1305)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [6, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1297)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [6, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [6, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [6, 4, 4, 4]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1327 = reshape(shape = var_1326, x = sqrt_s_t)[name = tensor<string, []>("op_1327")];
-            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1327)[name = tensor<string, []>("M")];
-            tensor<fp32, [6, 4, 4, 4]> var_1329 = mul(x = qk, y = M)[name = tensor<string, []>("op_1329")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1311)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [6, 4, 4, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1329, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1331_transpose_x_0 = const()[name = tensor<string, []>("op_1331_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1331_transpose_y_0 = const()[name = tensor<string, []>("op_1331_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 4, 64]> var_1331 = matmul(transpose_x = var_1331_transpose_x_0, transpose_y = var_1331_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1331")];
-            tensor<fp32, [4]> var_1332 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1332")];
-            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1334 = reshape(shape = var_1333, x = var_1332)[name = tensor<string, []>("op_1334")];
-            tensor<fp32, [6, 4, 4, 64]> cross = mul(x = var_1331, y = var_1334)[name = tensor<string, []>("cross")];
-            tensor<fp32, [6, 4, 4, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [6, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1157)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [6, 4, 64, 64]> var_1340 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1340")];
-            tensor<bool, []> var_1342_transpose_x_1 = const()[name = tensor<string, []>("op_1342_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1342_transpose_y_1 = const()[name = tensor<string, []>("op_1342_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1342 = matmul(transpose_x = var_1342_transpose_x_1, transpose_y = var_1342_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1342")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1340, y = var_1342)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1165)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
+            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
+            tensor<fp32, [6, 4, 4, 4]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [6, 4, 4, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 4, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
+            tensor<fp32, [4]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
+            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
+            tensor<fp32, [6, 4, 4, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
+            tensor<fp32, [6, 4, 4, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [6, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [6, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
+            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1022, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_58, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [6, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1351_perm_0 = const()[name = tensor<string, []>("op_1351_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 4, 4, 64]> var_1351 = transpose(perm = var_1351_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [6, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1019, x = var_1351)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1355 = const()[name = tensor<string, []>("op_1355"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [6, 4, 256]> out = reshape(shape = var_1355, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [6, 4, 256]> var_1357 = silu(x = input_187)[name = tensor<string, []>("op_1357")];
-            tensor<fp32, [6, 4, 256]> input_189 = mul(x = var_1357, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [6, 4, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [6, 4, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 4, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [6, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_73, x = var_1384)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [6, 4, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [6, 4, 256]> var_1390 = silu(x = input_189)[name = tensor<string, []>("op_1390")];
+            tensor<fp32, [6, 4, 256]> input_191 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 4, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [6, 4, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1017, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1367 = const()[name = tensor<string, []>("op_1367"), val = tensor<int32, [4]>([1, 6, 4, 256])];
-            tensor<fp32, [1, 6, 4, 256]> var_1368 = reshape(shape = var_1367, x = xt_5)[name = tensor<string, []>("op_1368")];
-            tensor<int32, [4]> var_1369_perm_0 = const()[name = tensor<string, []>("op_1369_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [3]>([4, 6, 256])];
-            tensor<fp32, [1, 4, 6, 256]> var_1369 = transpose(perm = var_1369_perm_0, x = var_1368)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [4, 6, 256]> query_5 = reshape(shape = var_1372, x = var_1369)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [6, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_65, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 6, 4, 256])];
+            tensor<fp32, [1, 6, 4, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
+            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([4, 6, 256])];
+            tensor<fp32, [1, 4, 6, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [4, 6, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 4, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [6, 4, 768]> var_1395 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [6, 4, 768]> var_1428 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([6, 4, 3, 256])];
-            tensor<fp32, [6, 4, 3, 256]> var_1397 = reshape(shape = concat_2, x = var_1395)[name = tensor<string, []>("op_1397")];
-            tensor<int32, [1]> var_1398_axes_0 = const()[name = tensor<string, []>("op_1398_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 4, 3, 256]> var_1398 = expand_dims(axes = var_1398_axes_0, x = var_1397)[name = tensor<string, []>("op_1398")];
-            tensor<int32, [5]> var_1399_perm_0 = const()[name = tensor<string, []>("op_1399_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1400_axes_0 = const()[name = tensor<string, []>("op_1400_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 4, 1, 256]> var_1399 = transpose(perm = var_1399_perm_0, x = var_1398)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 6, 4, 256]> var_1400 = squeeze(axes = var_1400_axes_0, x = var_1399)[name = tensor<string, []>("op_1400")];
+            tensor<fp32, [6, 4, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
+            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 4, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
+            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 4, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 6, 4, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 6, 4, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [6, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 6, 4, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [6, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 6, 4, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1409 = reshape(shape = var_1408, x = q_19)[name = tensor<string, []>("op_1409")];
+            tensor<fp32, [6, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1415 = const()[name = tensor<string, []>("op_1415"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1416 = reshape(shape = var_1415, x = k_19)[name = tensor<string, []>("op_1416")];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1422 = const()[name = tensor<string, []>("op_1422"), val = tensor<int32, [3]>([6, 16, 64])];
-            tensor<fp32, [6, 16, 64]> var_1423 = reshape(shape = var_1422, x = v_19)[name = tensor<string, []>("op_1423")];
+            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([6, 16, 64])];
+            tensor<fp32, [6, 16, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1426 = const()[name = tensor<string, []>("op_1426"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1409)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [4, 4, 6, 64]> q = reshape(shape = var_1426, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1428 = const()[name = tensor<string, []>("op_1428"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1416)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [4, 4, 6, 64]> k = reshape(shape = var_1428, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([4, 4, 6, 64])];
-            tensor<fp32, [16, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1423)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [4, 4, 6, 64]> v = reshape(shape = var_1430, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [4, 4, 6, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [4, 4, 6, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([4, 4, 6, 64])];
+            tensor<fp32, [16, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [4, 4, 6, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 6, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1277,36 +1292,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 6, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1438 = const()[name = tensor<string, []>("op_1438"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [6, 4, 4, 64]> var_1434 = transpose(perm = var_1433, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1438, x = var_1434)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1442 = const()[name = tensor<string, []>("op_1442"), val = tensor<int32, [3]>([6, 4, 256])];
-            tensor<fp32, [6, 4, 256]> attn_output = reshape(shape = var_1442, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [6, 4, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([6, 4, 256])];
+            tensor<fp32, [6, 4, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 6, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [4, 6, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 6, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1017, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [4, 6, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [4, 6, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [4, 6, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [4, 6, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 6, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 6, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_65, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [4, 6, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [4, 6, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 6, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [4, 6, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1017, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1462 = const()[name = tensor<string, []>("op_1462"), val = tensor<int32, [4]>([1, 4, 6, 256])];
-            tensor<fp32, [1, 4, 6, 256]> input = reshape(shape = var_1462, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1464 = const()[name = tensor<string, []>("op_1464"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 6, 1]> var_1465 = reduce_l2_norm(axes = var_1464, keep_dims = var_1020, x = input)[name = tensor<string, []>("op_1465")];
+            tensor<fp32, [4, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_65, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 4, 6, 256])];
+            tensor<fp32, [1, 4, 6, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 6, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_64, x = input)[name = tensor<string, []>("op_1498")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 6, 1]> clip_5 = clip(alpha = var_1012, beta = const_42, x = var_1465)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 4, 6, 256]> var_1467 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1467")];
+            tensor<fp32, [1, 4, 6, 1]> clip_5 = clip(alpha = var_78, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 4, 6, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([4, 1, 256])];
             tensor<fp32, [4, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([4, 256, 6])];
-            tensor<fp32, [1, 4, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1467)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 4, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [4, 256, 6]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1317,10 +1332,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 4, 5])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 4, 4]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 4, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1471")];
-            tensor<int32, []> var_1473_axis_0 = const()[name = tensor<string, []>("op_1473_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1473_axis_0, values = (var_1169, nkv))[name = tensor<string, []>("op_1473")];
-            tensor<int32, []> var_1475_axis_0 = const()[name = tensor<string, []>("op_1475_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1475_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1475")];
+            tensor<fp32, [1, 4, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
+            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
+            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 11bd6cc8bc201ab8dc743d904c5a906849392429..67687bbb2394cad079080ece84b036a41b37d124 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3bf0e98390dbb2eac96d1c7400ce29fa396132e83c940c9f6fb9aa1d779f97c3
-size 191035
+oid sha256:139c50454d888783e9c845f69f1948c913ba3b6d9e5773f5abae72887cb75a72
+size 197107
diff --git a/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Manifest.json b/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Manifest.json
index 4ea926ed488b5227c9d9a317766f7e6b911916dd..26fc622c438e51bb17223f8c3f9f85d13685006a 100644
--- a/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Manifest.json
+++ b/optimized/ami/400ms/ls_eend_ami_400ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "111C110F-82D7-4E61-A58C-05908C6E61C8": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Weights",
-            "name": "weights",
-            "path": "com.apple.CoreML/weights"
-        },
-        "A106D900-1931-4022-84A9-F374DF9D3F12": {
+        "DEA911EA-DD5F-40B3-ACAD-71757B857265": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "EE26D82A-DA8A-4B4C-938F-63934CA9722D": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "A106D900-1931-4022-84A9-F374DF9D3F12"
+    "rootModelIdentifier": "DEA911EA-DD5F-40B3-ACAD-71757B857265"
 }
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/analytics/coremldata.bin b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/analytics/coremldata.bin
index 38627aa1b674f9fdb4fc74b0266575b8228d9989..329d62e0e526a6645a8be26cf6cdc4659b12e0c9 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e15806e285f1582d15c27637594c49104f55b3b7b72abdaad4b32313223bee5b
+oid sha256:8c8d6032e92c8c43fe974f203d4a9041453e83932bbc133eaa00605afe3464b4
 size 243
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/coremldata.bin b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/coremldata.bin
index 93ccd031fa1a60d8bc1e9e1900eebe4ed013ded1..a6c28c2ec1c0ed0e854ae17762c90fa173e4266b 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/coremldata.bin
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77dd81a1ebcc4c55be5fef0d9f8639561671cb3d7b23bed333b23b53fe9a5ab7
-size 1292
+oid sha256:984dbda533de03f37b496d41400be4fb0cad97233106f868482ef79302526a38
+size 1395
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/metadata.json b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/metadata.json
index 410cbbe3ea4581edc9b16eeffe96a1435ac493c2..3ec79fce1e284c0951daf5eba6209646633f21bd 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/metadata.json
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=5, max_speakers=4)",
+    "shortDescription" : "LS-EEND AMI streaming diarizer (pipeline, T=5, max_speakers=4, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 72,
+      "Ios17.sliceByIndex" : 77,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 26,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 5 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 55 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 5, 345]",
+        "shape" : "[1, 55, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ami\", \"model_label\": \"AMI\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 4, \"max_nspks\": 6, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 55}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/model.mil b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/model.mil
index f062932fd3c10f5f9a6925badb848bf8e06e6c01..7f57cbf699017c4f570f6ccdd73a5252e8b8642e 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/model.mil
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlmodelc/model.mil
@@ -1,234 +1,260 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 5, 345]> features, tensor<fp32, [5]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [5, 5]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [5]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
-            tensor<fp32, [5, 5]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_18 = const()[name = tensor<string, []>("op_18"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_21 = const()[name = tensor<string, []>("op_21"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_24 = const()[name = tensor<string, []>("op_24"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_27 = const()[name = tensor<string, []>("op_27"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 5, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_29, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 6, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 55, 23]> features, tensor<fp32, [5]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [5, 5]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [5]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
+            tensor<fp32, [5, 5]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 45, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, [3]> var_59_begin_0 = const()[name = tensor<string, []>("op_59_begin_0"), val = tensor<int32, [3]>([0, 40, 0])];
+            tensor<int32, [3]> var_59_end_0 = const()[name = tensor<string, []>("op_59_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_59_end_mask_0 = const()[name = tensor<string, []>("op_59_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_59 = slice_by_index(begin = var_59_begin_0, end = var_59_end_0, end_mask = var_59_end_mask_0, x = features)[name = tensor<string, []>("op_59")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49, var_59))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, [3]>([1, 5, 345])];
+            tensor<fp32, [1, 5, 345]> input_1 = reshape(shape = var_66, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_68 = const()[name = tensor<string, []>("op_68"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_73 = const()[name = tensor<string, []>("op_73"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_77 = const()[name = tensor<string, []>("op_77"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_80 = const()[name = tensor<string, []>("op_80"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_88 = const()[name = tensor<string, []>("op_88"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_92 = const()[name = tensor<string, []>("op_92"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 5, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 5, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_148 = const()[name = tensor<string, []>("op_148"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_149 = mul(x = input_11, y = var_148)[name = tensor<string, []>("op_149")];
-            tensor<fp32, [1, 5, 256]> input_13 = add(x = var_149, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_74, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 5, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 5, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_213 = const()[name = tensor<string, []>("op_213"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_214 = mul(x = input_13, y = var_213)[name = tensor<string, []>("op_214")];
+            tensor<fp32, [1, 5, 256]> input_15 = add(x = var_214, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_29, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_74, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,183 +265,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 5, 256]> var_163 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_164 = const()[name = tensor<string, []>("op_164"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_165 = reshape(shape = var_164, x = var_163)[name = tensor<string, []>("op_165")];
+            tensor<fp32, [1, 5, 256]> var_228 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_230 = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("op_230")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_169 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_170 = const()[name = tensor<string, []>("op_170"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_171 = mul(x = var_169, y = var_170)[name = tensor<string, []>("op_171")];
-            tensor<int32, [4]> var_172 = const()[name = tensor<string, []>("op_172"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_173 = reshape(shape = var_172, x = var_171)[name = tensor<string, []>("op_173")];
+            tensor<fp32, [1, 5, 256]> var_234 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_235 = const()[name = tensor<string, []>("op_235"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_236 = mul(x = var_234, y = var_235)[name = tensor<string, []>("op_236")];
+            tensor<int32, [4]> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_238 = reshape(shape = var_237, x = var_236)[name = tensor<string, []>("op_238")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_177 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_179 = reshape(shape = var_178, x = var_177)[name = tensor<string, []>("op_179")];
+            tensor<fp32, [1, 5, 256]> var_242 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_243 = const()[name = tensor<string, []>("op_243"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_244 = reshape(shape = var_243, x = var_242)[name = tensor<string, []>("op_244")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 5, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [5]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_173)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_165)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_238)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_230)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 5, 5]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_190 = reshape(shape = var_189, x = sqrt_s_t_1)[name = tensor<string, []>("op_190")];
-            tensor<fp32, [5, 5]> M_1 = real_div(x = encoder__causal_mask, y = var_190)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 5, 5]> var_192 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_192")];
+            tensor<int32, [2]> var_254 = const()[name = tensor<string, []>("op_254"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_255 = reshape(shape = var_254, x = sqrt_s_t_1)[name = tensor<string, []>("op_255")];
+            tensor<fp32, [5, 5]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_255)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 5, 5]> var_257 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_257")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_179)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_192, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_194_transpose_x_0 = const()[name = tensor<string, []>("op_194_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_194_transpose_y_0 = const()[name = tensor<string, []>("op_194_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_194 = matmul(transpose_x = var_194_transpose_x_0, transpose_y = var_194_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_194")];
-            tensor<fp32, [5]> var_195 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_195")];
-            tensor<int32, [4]> var_196 = const()[name = tensor<string, []>("op_196"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_197 = reshape(shape = var_196, x = var_195)[name = tensor<string, []>("op_197")];
-            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_194, y = var_197)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_244)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_257, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_259_transpose_x_0 = const()[name = tensor<string, []>("op_259_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_259_transpose_y_0 = const()[name = tensor<string, []>("op_259_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_259 = matmul(transpose_x = var_259_transpose_x_0, transpose_y = var_259_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_259")];
+            tensor<fp32, [5]> var_260 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_260")];
+            tensor<int32, [4]> var_261 = const()[name = tensor<string, []>("op_261"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_262 = reshape(shape = var_261, x = var_260)[name = tensor<string, []>("op_262")];
+            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_259, y = var_262)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 5, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_200 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_200")];
-            tensor<bool, []> var_202_transpose_x_1 = const()[name = tensor<string, []>("op_202_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_202_transpose_y_1 = const()[name = tensor<string, []>("op_202_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_202 = matmul(transpose_x = var_202_transpose_x_1, transpose_y = var_202_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_202")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_200, y = var_202)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_204 = const()[name = tensor<string, []>("op_204"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_204)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_206 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_206")];
-            tensor<fp32, [1, 4, 64, 64]> var_207 = real_div(x = new_kv_unnorm_1, y = var_206)[name = tensor<string, []>("op_207")];
-            tensor<int32, [4]> var_208_perm_0 = const()[name = tensor<string, []>("op_208_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_265 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_265")];
+            tensor<bool, []> var_267_transpose_x_1 = const()[name = tensor<string, []>("op_267_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_267_transpose_y_1 = const()[name = tensor<string, []>("op_267_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_267 = matmul(transpose_x = var_267_transpose_x_1, transpose_y = var_267_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_267")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_265, y = var_267)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_269 = const()[name = tensor<string, []>("op_269"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_269)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_271 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_271")];
+            tensor<fp32, [1, 4, 64, 64]> var_272 = real_div(x = new_kv_unnorm_1, y = var_271)[name = tensor<string, []>("op_272")];
+            tensor<int32, [4]> var_273_perm_0 = const()[name = tensor<string, []>("op_273_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_208 = transpose(perm = var_208_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_18, x = var_208)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_212 = const()[name = tensor<string, []>("op_212"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_212, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 5, 256]> var_214 = silu(x = input_17)[name = tensor<string, []>("op_214")];
-            tensor<fp32, [1, 5, 256]> input_19 = mul(x = var_214, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 5, 4, 64]> var_273 = transpose(perm = var_273_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_82, x = var_273)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_277 = const()[name = tensor<string, []>("op_277"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_277, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 5, 256]> var_279 = silu(x = input_19)[name = tensor<string, []>("op_279")];
+            tensor<fp32, [1, 5, 256]> input_21 = mul(x = var_279, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_222_begin_0 = const()[name = tensor<string, []>("op_222_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_222_end_0 = const()[name = tensor<string, []>("op_222_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_222_end_mask_0 = const()[name = tensor<string, []>("op_222_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_222 = slice_by_index(begin = var_222_begin_0, end = var_222_end_0, end_mask = var_222_end_mask_0, x = x_3)[name = tensor<string, []>("op_222")];
-            tensor<int32, [3]> var_225_begin_0 = const()[name = tensor<string, []>("op_225_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_225_end_0 = const()[name = tensor<string, []>("op_225_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_225_end_mask_0 = const()[name = tensor<string, []>("op_225_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_225 = slice_by_index(begin = var_225_begin_0, end = var_225_end_0, end_mask = var_225_end_mask_0, x = window_1)[name = tensor<string, []>("op_225")];
+            tensor<int32, [3]> var_287_begin_0 = const()[name = tensor<string, []>("op_287_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_287_end_0 = const()[name = tensor<string, []>("op_287_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_287_end_mask_0 = const()[name = tensor<string, []>("op_287_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_287 = slice_by_index(begin = var_287_begin_0, end = var_287_end_0, end_mask = var_287_end_mask_0, x = x_3)[name = tensor<string, []>("op_287")];
+            tensor<int32, [3]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_290 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = window_1)[name = tensor<string, []>("op_290")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_27, interleave = window_3_interleave_0, values = (var_225, var_222))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_230_begin_0 = const()[name = tensor<string, []>("op_230_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_230_end_0 = const()[name = tensor<string, []>("op_230_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_230_end_mask_0 = const()[name = tensor<string, []>("op_230_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_230 = slice_by_index(begin = var_230_begin_0, end = var_230_end_0, end_mask = var_230_end_mask_0, x = x_3)[name = tensor<string, []>("op_230")];
-            tensor<int32, [3]> var_233_begin_0 = const()[name = tensor<string, []>("op_233_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_233_end_0 = const()[name = tensor<string, []>("op_233_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_233_end_mask_0 = const()[name = tensor<string, []>("op_233_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_233 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = window_3)[name = tensor<string, []>("op_233")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_92, interleave = window_3_interleave_0, values = (var_290, var_287))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_295_begin_0 = const()[name = tensor<string, []>("op_295_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_295_end_0 = const()[name = tensor<string, []>("op_295_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_295_end_mask_0 = const()[name = tensor<string, []>("op_295_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_295 = slice_by_index(begin = var_295_begin_0, end = var_295_end_0, end_mask = var_295_end_mask_0, x = x_3)[name = tensor<string, []>("op_295")];
+            tensor<int32, [3]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_298 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = window_3)[name = tensor<string, []>("op_298")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_27, interleave = window_5_interleave_0, values = (var_233, var_230))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_238_begin_0 = const()[name = tensor<string, []>("op_238_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_238_end_0 = const()[name = tensor<string, []>("op_238_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_238_end_mask_0 = const()[name = tensor<string, []>("op_238_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_238 = slice_by_index(begin = var_238_begin_0, end = var_238_end_0, end_mask = var_238_end_mask_0, x = x_3)[name = tensor<string, []>("op_238")];
-            tensor<int32, [3]> var_241_begin_0 = const()[name = tensor<string, []>("op_241_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_241_end_0 = const()[name = tensor<string, []>("op_241_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_241_end_mask_0 = const()[name = tensor<string, []>("op_241_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_241 = slice_by_index(begin = var_241_begin_0, end = var_241_end_0, end_mask = var_241_end_mask_0, x = window_5)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_92, interleave = window_5_interleave_0, values = (var_298, var_295))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_303_begin_0 = const()[name = tensor<string, []>("op_303_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_303_end_0 = const()[name = tensor<string, []>("op_303_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_303_end_mask_0 = const()[name = tensor<string, []>("op_303_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_303 = slice_by_index(begin = var_303_begin_0, end = var_303_end_0, end_mask = var_303_end_mask_0, x = x_3)[name = tensor<string, []>("op_303")];
+            tensor<int32, [3]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_306 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = window_5)[name = tensor<string, []>("op_306")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_27, interleave = window_7_interleave_0, values = (var_241, var_238))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_246_begin_0 = const()[name = tensor<string, []>("op_246_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_246_end_0 = const()[name = tensor<string, []>("op_246_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_246_end_mask_0 = const()[name = tensor<string, []>("op_246_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_246 = slice_by_index(begin = var_246_begin_0, end = var_246_end_0, end_mask = var_246_end_mask_0, x = x_3)[name = tensor<string, []>("op_246")];
-            tensor<int32, [3]> var_249_begin_0 = const()[name = tensor<string, []>("op_249_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_249_end_0 = const()[name = tensor<string, []>("op_249_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_249_end_mask_0 = const()[name = tensor<string, []>("op_249_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_249 = slice_by_index(begin = var_249_begin_0, end = var_249_end_0, end_mask = var_249_end_mask_0, x = window_7)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_92, interleave = window_7_interleave_0, values = (var_306, var_303))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_311_begin_0 = const()[name = tensor<string, []>("op_311_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_311_end_0 = const()[name = tensor<string, []>("op_311_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_311_end_mask_0 = const()[name = tensor<string, []>("op_311_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_311 = slice_by_index(begin = var_311_begin_0, end = var_311_end_0, end_mask = var_311_end_mask_0, x = x_3)[name = tensor<string, []>("op_311")];
+            tensor<int32, [3]> var_314_begin_0 = const()[name = tensor<string, []>("op_314_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_314_end_0 = const()[name = tensor<string, []>("op_314_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_314_end_mask_0 = const()[name = tensor<string, []>("op_314_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_314 = slice_by_index(begin = var_314_begin_0, end = var_314_end_0, end_mask = var_314_end_mask_0, x = window_7)[name = tensor<string, []>("op_314")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_27, interleave = window_9_interleave_0, values = (var_249, var_246))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_254_begin_0 = const()[name = tensor<string, []>("op_254_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_254_end_0 = const()[name = tensor<string, []>("op_254_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_254_end_mask_0 = const()[name = tensor<string, []>("op_254_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_254 = slice_by_index(begin = var_254_begin_0, end = var_254_end_0, end_mask = var_254_end_mask_0, x = x_3)[name = tensor<string, []>("op_254")];
-            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = window_9)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_92, interleave = window_9_interleave_0, values = (var_314, var_311))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_319_begin_0 = const()[name = tensor<string, []>("op_319_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_319_end_0 = const()[name = tensor<string, []>("op_319_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_319_end_mask_0 = const()[name = tensor<string, []>("op_319_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_319 = slice_by_index(begin = var_319_begin_0, end = var_319_end_0, end_mask = var_319_end_mask_0, x = x_3)[name = tensor<string, []>("op_319")];
+            tensor<int32, [3]> var_322_begin_0 = const()[name = tensor<string, []>("op_322_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_322_end_0 = const()[name = tensor<string, []>("op_322_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_322_end_mask_0 = const()[name = tensor<string, []>("op_322_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_322 = slice_by_index(begin = var_322_begin_0, end = var_322_end_0, end_mask = var_322_end_mask_0, x = window_9)[name = tensor<string, []>("op_322")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_27, interleave = window_11_interleave_0, values = (var_257, var_254))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_21 = concat(axis = var_24, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_92, interleave = window_11_interleave_0, values = (var_322, var_319))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_23 = concat(axis = var_77, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_282_split_sizes_0 = const()[name = tensor<string, []>("op_282_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_282_axis_0 = const()[name = tensor<string, []>("op_282_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_282_0, tensor<fp32, [5, 256, 16]> var_282_1 = split(axis = var_282_axis_0, split_sizes = var_282_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_282")];
-            tensor<fp32, [5, 256, 16]> var_284 = sigmoid(x = var_282_1)[name = tensor<string, []>("op_284")];
-            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_282_0, y = var_284)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [5, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_347_split_sizes_0 = const()[name = tensor<string, []>("op_347_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_347_axis_0 = const()[name = tensor<string, []>("op_347_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_347_0, tensor<fp32, [5, 256, 16]> var_347_1 = split(axis = var_347_axis_0, split_sizes = var_347_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_347")];
+            tensor<fp32, [5, 256, 16]> var_349 = sigmoid(x = var_347_1)[name = tensor<string, []>("op_349")];
+            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_347_0, y = var_349)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [5, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_74, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [5, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [5, 1, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_315")];
-            tensor<int32, [3]> var_317_perm_0 = const()[name = tensor<string, []>("op_317_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_317 = transpose(perm = var_317_perm_0, x = var_315)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 5, 256]> input_31 = add(x = x_3, y = var_317)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 5, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 5, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_340 = const()[name = tensor<string, []>("op_340"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_341 = mul(x = input_39, y = var_340)[name = tensor<string, []>("op_341")];
-            tensor<fp32, [1, 5, 256]> input_41 = add(x = var_341, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_29, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_380_begin_0 = const()[name = tensor<string, []>("op_380_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_380_end_0 = const()[name = tensor<string, []>("op_380_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_380_end_mask_0 = const()[name = tensor<string, []>("op_380_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [5, 1, 256]> var_380 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_380")];
+            tensor<int32, [3]> var_382_perm_0 = const()[name = tensor<string, []>("op_382_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_382 = transpose(perm = var_382_perm_0, x = var_380)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 5, 256]> input_33 = add(x = x_3, y = var_382)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 5, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 5, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_405 = const()[name = tensor<string, []>("op_405"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_406 = mul(x = input_41, y = var_405)[name = tensor<string, []>("op_406")];
+            tensor<fp32, [1, 5, 256]> input_43 = add(x = var_406, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 5, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 5, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_370 = const()[name = tensor<string, []>("op_370"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_371 = mul(x = input_51, y = var_370)[name = tensor<string, []>("op_371")];
-            tensor<fp32, [1, 5, 256]> input_53 = add(x = var_371, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_74, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 5, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 5, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_435 = const()[name = tensor<string, []>("op_435"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_436 = mul(x = input_53, y = var_435)[name = tensor<string, []>("op_436")];
+            tensor<fp32, [1, 5, 256]> input_55 = add(x = var_436, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_29, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_74, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -426,183 +452,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 5, 256]> var_385 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_387 = reshape(shape = var_386, x = var_385)[name = tensor<string, []>("op_387")];
+            tensor<fp32, [1, 5, 256]> var_450 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_451 = const()[name = tensor<string, []>("op_451"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_452 = reshape(shape = var_451, x = var_450)[name = tensor<string, []>("op_452")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_391 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_392 = const()[name = tensor<string, []>("op_392"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_393 = mul(x = var_391, y = var_392)[name = tensor<string, []>("op_393")];
-            tensor<int32, [4]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_395 = reshape(shape = var_394, x = var_393)[name = tensor<string, []>("op_395")];
+            tensor<fp32, [1, 5, 256]> var_456 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_457 = const()[name = tensor<string, []>("op_457"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_458 = mul(x = var_456, y = var_457)[name = tensor<string, []>("op_458")];
+            tensor<int32, [4]> var_459 = const()[name = tensor<string, []>("op_459"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_460 = reshape(shape = var_459, x = var_458)[name = tensor<string, []>("op_460")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_399 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_400 = const()[name = tensor<string, []>("op_400"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_401 = reshape(shape = var_400, x = var_399)[name = tensor<string, []>("op_401")];
+            tensor<fp32, [1, 5, 256]> var_464 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_465 = const()[name = tensor<string, []>("op_465"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_466 = reshape(shape = var_465, x = var_464)[name = tensor<string, []>("op_466")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 5, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [5]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_395)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_387)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_460)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_452)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 5, 5]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_412 = reshape(shape = var_411, x = sqrt_s_t_3)[name = tensor<string, []>("op_412")];
-            tensor<fp32, [5, 5]> M_3 = real_div(x = encoder__causal_mask, y = var_412)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 5, 5]> var_414 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_414")];
+            tensor<int32, [2]> var_476 = const()[name = tensor<string, []>("op_476"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_477 = reshape(shape = var_476, x = sqrt_s_t_3)[name = tensor<string, []>("op_477")];
+            tensor<fp32, [5, 5]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_477)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 5, 5]> var_479 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_479")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_401)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_414, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_416_transpose_x_0 = const()[name = tensor<string, []>("op_416_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_416_transpose_y_0 = const()[name = tensor<string, []>("op_416_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_416 = matmul(transpose_x = var_416_transpose_x_0, transpose_y = var_416_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_416")];
-            tensor<fp32, [5]> var_417 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_417")];
-            tensor<int32, [4]> var_418 = const()[name = tensor<string, []>("op_418"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_419 = reshape(shape = var_418, x = var_417)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_416, y = var_419)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_466)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_479, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_481_transpose_x_0 = const()[name = tensor<string, []>("op_481_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_481_transpose_y_0 = const()[name = tensor<string, []>("op_481_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_481 = matmul(transpose_x = var_481_transpose_x_0, transpose_y = var_481_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_481")];
+            tensor<fp32, [5]> var_482 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_482")];
+            tensor<int32, [4]> var_483 = const()[name = tensor<string, []>("op_483"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_484 = reshape(shape = var_483, x = var_482)[name = tensor<string, []>("op_484")];
+            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_481, y = var_484)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 5, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_422 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_422")];
-            tensor<bool, []> var_424_transpose_x_1 = const()[name = tensor<string, []>("op_424_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_424_transpose_y_1 = const()[name = tensor<string, []>("op_424_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_424 = matmul(transpose_x = var_424_transpose_x_1, transpose_y = var_424_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_424")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_422, y = var_424)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_426 = const()[name = tensor<string, []>("op_426"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_426)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_428 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_428")];
-            tensor<fp32, [1, 4, 64, 64]> var_429 = real_div(x = new_kv_unnorm_3, y = var_428)[name = tensor<string, []>("op_429")];
-            tensor<int32, [4]> var_430_perm_0 = const()[name = tensor<string, []>("op_430_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_487 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_487")];
+            tensor<bool, []> var_489_transpose_x_1 = const()[name = tensor<string, []>("op_489_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_489_transpose_y_1 = const()[name = tensor<string, []>("op_489_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_489 = matmul(transpose_x = var_489_transpose_x_1, transpose_y = var_489_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_489")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_487, y = var_489)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_491 = const()[name = tensor<string, []>("op_491"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_491)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_493 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_493")];
+            tensor<fp32, [1, 4, 64, 64]> var_494 = real_div(x = new_kv_unnorm_3, y = var_493)[name = tensor<string, []>("op_494")];
+            tensor<int32, [4]> var_495_perm_0 = const()[name = tensor<string, []>("op_495_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_430 = transpose(perm = var_430_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_18, x = var_430)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_434 = const()[name = tensor<string, []>("op_434"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_434, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 5, 256]> var_436 = silu(x = input_57)[name = tensor<string, []>("op_436")];
-            tensor<fp32, [1, 5, 256]> input_59 = mul(x = var_436, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 5, 4, 64]> var_495 = transpose(perm = var_495_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_82, x = var_495)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_499 = const()[name = tensor<string, []>("op_499"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_499, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 5, 256]> var_501 = silu(x = input_59)[name = tensor<string, []>("op_501")];
+            tensor<fp32, [1, 5, 256]> input_61 = mul(x = var_501, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_444_begin_0 = const()[name = tensor<string, []>("op_444_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_444_end_0 = const()[name = tensor<string, []>("op_444_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_444_end_mask_0 = const()[name = tensor<string, []>("op_444_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_444 = slice_by_index(begin = var_444_begin_0, end = var_444_end_0, end_mask = var_444_end_mask_0, x = x_9)[name = tensor<string, []>("op_444")];
-            tensor<int32, [3]> var_447_begin_0 = const()[name = tensor<string, []>("op_447_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_447_end_0 = const()[name = tensor<string, []>("op_447_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_447_end_mask_0 = const()[name = tensor<string, []>("op_447_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_447 = slice_by_index(begin = var_447_begin_0, end = var_447_end_0, end_mask = var_447_end_mask_0, x = window_13)[name = tensor<string, []>("op_447")];
+            tensor<int32, [3]> var_509_begin_0 = const()[name = tensor<string, []>("op_509_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_509_end_0 = const()[name = tensor<string, []>("op_509_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_509_end_mask_0 = const()[name = tensor<string, []>("op_509_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_509 = slice_by_index(begin = var_509_begin_0, end = var_509_end_0, end_mask = var_509_end_mask_0, x = x_9)[name = tensor<string, []>("op_509")];
+            tensor<int32, [3]> var_512_begin_0 = const()[name = tensor<string, []>("op_512_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_512_end_0 = const()[name = tensor<string, []>("op_512_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_512_end_mask_0 = const()[name = tensor<string, []>("op_512_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_512 = slice_by_index(begin = var_512_begin_0, end = var_512_end_0, end_mask = var_512_end_mask_0, x = window_13)[name = tensor<string, []>("op_512")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_27, interleave = window_15_interleave_0, values = (var_447, var_444))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_452_begin_0 = const()[name = tensor<string, []>("op_452_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_452_end_0 = const()[name = tensor<string, []>("op_452_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_452_end_mask_0 = const()[name = tensor<string, []>("op_452_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_452 = slice_by_index(begin = var_452_begin_0, end = var_452_end_0, end_mask = var_452_end_mask_0, x = x_9)[name = tensor<string, []>("op_452")];
-            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = window_15)[name = tensor<string, []>("op_455")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_92, interleave = window_15_interleave_0, values = (var_512, var_509))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_517_begin_0 = const()[name = tensor<string, []>("op_517_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_517_end_0 = const()[name = tensor<string, []>("op_517_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_517_end_mask_0 = const()[name = tensor<string, []>("op_517_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_517 = slice_by_index(begin = var_517_begin_0, end = var_517_end_0, end_mask = var_517_end_mask_0, x = x_9)[name = tensor<string, []>("op_517")];
+            tensor<int32, [3]> var_520_begin_0 = const()[name = tensor<string, []>("op_520_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_520_end_0 = const()[name = tensor<string, []>("op_520_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_520_end_mask_0 = const()[name = tensor<string, []>("op_520_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_520 = slice_by_index(begin = var_520_begin_0, end = var_520_end_0, end_mask = var_520_end_mask_0, x = window_15)[name = tensor<string, []>("op_520")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_27, interleave = window_17_interleave_0, values = (var_455, var_452))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_460_begin_0 = const()[name = tensor<string, []>("op_460_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_460_end_0 = const()[name = tensor<string, []>("op_460_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_460_end_mask_0 = const()[name = tensor<string, []>("op_460_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_460 = slice_by_index(begin = var_460_begin_0, end = var_460_end_0, end_mask = var_460_end_mask_0, x = x_9)[name = tensor<string, []>("op_460")];
-            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = window_17)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_92, interleave = window_17_interleave_0, values = (var_520, var_517))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_525_begin_0 = const()[name = tensor<string, []>("op_525_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_525_end_0 = const()[name = tensor<string, []>("op_525_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_525_end_mask_0 = const()[name = tensor<string, []>("op_525_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_525 = slice_by_index(begin = var_525_begin_0, end = var_525_end_0, end_mask = var_525_end_mask_0, x = x_9)[name = tensor<string, []>("op_525")];
+            tensor<int32, [3]> var_528_begin_0 = const()[name = tensor<string, []>("op_528_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_528_end_0 = const()[name = tensor<string, []>("op_528_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_528_end_mask_0 = const()[name = tensor<string, []>("op_528_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_528 = slice_by_index(begin = var_528_begin_0, end = var_528_end_0, end_mask = var_528_end_mask_0, x = window_17)[name = tensor<string, []>("op_528")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_27, interleave = window_19_interleave_0, values = (var_463, var_460))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_468_begin_0 = const()[name = tensor<string, []>("op_468_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_468_end_0 = const()[name = tensor<string, []>("op_468_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_468_end_mask_0 = const()[name = tensor<string, []>("op_468_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_468 = slice_by_index(begin = var_468_begin_0, end = var_468_end_0, end_mask = var_468_end_mask_0, x = x_9)[name = tensor<string, []>("op_468")];
-            tensor<int32, [3]> var_471_begin_0 = const()[name = tensor<string, []>("op_471_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_471_end_0 = const()[name = tensor<string, []>("op_471_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_471_end_mask_0 = const()[name = tensor<string, []>("op_471_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_471 = slice_by_index(begin = var_471_begin_0, end = var_471_end_0, end_mask = var_471_end_mask_0, x = window_19)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_92, interleave = window_19_interleave_0, values = (var_528, var_525))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_533_begin_0 = const()[name = tensor<string, []>("op_533_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_533_end_0 = const()[name = tensor<string, []>("op_533_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_533_end_mask_0 = const()[name = tensor<string, []>("op_533_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_533 = slice_by_index(begin = var_533_begin_0, end = var_533_end_0, end_mask = var_533_end_mask_0, x = x_9)[name = tensor<string, []>("op_533")];
+            tensor<int32, [3]> var_536_begin_0 = const()[name = tensor<string, []>("op_536_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_536_end_0 = const()[name = tensor<string, []>("op_536_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_536_end_mask_0 = const()[name = tensor<string, []>("op_536_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_536 = slice_by_index(begin = var_536_begin_0, end = var_536_end_0, end_mask = var_536_end_mask_0, x = window_19)[name = tensor<string, []>("op_536")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_27, interleave = window_21_interleave_0, values = (var_471, var_468))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = x_9)[name = tensor<string, []>("op_476")];
-            tensor<int32, [3]> var_479_begin_0 = const()[name = tensor<string, []>("op_479_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_479_end_0 = const()[name = tensor<string, []>("op_479_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_479_end_mask_0 = const()[name = tensor<string, []>("op_479_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_479 = slice_by_index(begin = var_479_begin_0, end = var_479_end_0, end_mask = var_479_end_mask_0, x = window_21)[name = tensor<string, []>("op_479")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_92, interleave = window_21_interleave_0, values = (var_536, var_533))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_541_begin_0 = const()[name = tensor<string, []>("op_541_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_541_end_0 = const()[name = tensor<string, []>("op_541_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_541_end_mask_0 = const()[name = tensor<string, []>("op_541_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_541 = slice_by_index(begin = var_541_begin_0, end = var_541_end_0, end_mask = var_541_end_mask_0, x = x_9)[name = tensor<string, []>("op_541")];
+            tensor<int32, [3]> var_544_begin_0 = const()[name = tensor<string, []>("op_544_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_544_end_0 = const()[name = tensor<string, []>("op_544_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_544_end_mask_0 = const()[name = tensor<string, []>("op_544_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_544 = slice_by_index(begin = var_544_begin_0, end = var_544_end_0, end_mask = var_544_end_mask_0, x = window_21)[name = tensor<string, []>("op_544")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_27, interleave = window_23_interleave_0, values = (var_479, var_476))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_61 = concat(axis = var_24, interleave = input_61_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_92, interleave = window_23_interleave_0, values = (var_544, var_541))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_63 = concat(axis = var_77, interleave = input_63_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_504_split_sizes_0 = const()[name = tensor<string, []>("op_504_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_504_axis_0 = const()[name = tensor<string, []>("op_504_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_504_0, tensor<fp32, [5, 256, 16]> var_504_1 = split(axis = var_504_axis_0, split_sizes = var_504_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_504")];
-            tensor<fp32, [5, 256, 16]> var_506 = sigmoid(x = var_504_1)[name = tensor<string, []>("op_506")];
-            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_504_0, y = var_506)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [5, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_569_split_sizes_0 = const()[name = tensor<string, []>("op_569_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_569_axis_0 = const()[name = tensor<string, []>("op_569_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_569_0, tensor<fp32, [5, 256, 16]> var_569_1 = split(axis = var_569_axis_0, split_sizes = var_569_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_569")];
+            tensor<fp32, [5, 256, 16]> var_571 = sigmoid(x = var_569_1)[name = tensor<string, []>("op_571")];
+            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_569_0, y = var_571)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [5, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_74, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [5, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [5, 1, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_537")];
-            tensor<int32, [3]> var_539_perm_0 = const()[name = tensor<string, []>("op_539_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_539 = transpose(perm = var_539_perm_0, x = var_537)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 5, 256]> input_71 = add(x = x_9, y = var_539)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 5, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 5, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_562 = const()[name = tensor<string, []>("op_562"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_563 = mul(x = input_79, y = var_562)[name = tensor<string, []>("op_563")];
-            tensor<fp32, [1, 5, 256]> input_81 = add(x = var_563, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_29, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_602_begin_0 = const()[name = tensor<string, []>("op_602_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_602_end_0 = const()[name = tensor<string, []>("op_602_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_602_end_mask_0 = const()[name = tensor<string, []>("op_602_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [5, 1, 256]> var_602 = slice_by_index(begin = var_602_begin_0, end = var_602_end_0, end_mask = var_602_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_602")];
+            tensor<int32, [3]> var_604_perm_0 = const()[name = tensor<string, []>("op_604_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_604 = transpose(perm = var_604_perm_0, x = var_602)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 5, 256]> input_73 = add(x = x_9, y = var_604)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 5, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 5, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_628 = mul(x = input_81, y = var_627)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 5, 256]> input_83 = add(x = var_628, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 5, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 5, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_592 = const()[name = tensor<string, []>("op_592"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_593 = mul(x = input_91, y = var_592)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 5, 256]> input_93 = add(x = var_593, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_74, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 5, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 5, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_657 = const()[name = tensor<string, []>("op_657"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_658 = mul(x = input_93, y = var_657)[name = tensor<string, []>("op_658")];
+            tensor<fp32, [1, 5, 256]> input_95 = add(x = var_658, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_29, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_74, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -613,183 +639,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 5, 256]> var_607 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_608 = const()[name = tensor<string, []>("op_608"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_609 = reshape(shape = var_608, x = var_607)[name = tensor<string, []>("op_609")];
+            tensor<fp32, [1, 5, 256]> var_672 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_673 = const()[name = tensor<string, []>("op_673"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_674 = reshape(shape = var_673, x = var_672)[name = tensor<string, []>("op_674")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_613 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_614 = const()[name = tensor<string, []>("op_614"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_615 = mul(x = var_613, y = var_614)[name = tensor<string, []>("op_615")];
-            tensor<int32, [4]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_617 = reshape(shape = var_616, x = var_615)[name = tensor<string, []>("op_617")];
+            tensor<fp32, [1, 5, 256]> var_678 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_680 = mul(x = var_678, y = var_679)[name = tensor<string, []>("op_680")];
+            tensor<int32, [4]> var_681 = const()[name = tensor<string, []>("op_681"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_682 = reshape(shape = var_681, x = var_680)[name = tensor<string, []>("op_682")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_621 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_622 = const()[name = tensor<string, []>("op_622"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_623 = reshape(shape = var_622, x = var_621)[name = tensor<string, []>("op_623")];
+            tensor<fp32, [1, 5, 256]> var_686 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_688 = reshape(shape = var_687, x = var_686)[name = tensor<string, []>("op_688")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 5, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [5]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_617)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_609)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_682)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_674)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 5, 5]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_633 = const()[name = tensor<string, []>("op_633"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_634 = reshape(shape = var_633, x = sqrt_s_t_5)[name = tensor<string, []>("op_634")];
-            tensor<fp32, [5, 5]> M_5 = real_div(x = encoder__causal_mask, y = var_634)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 5, 5]> var_636 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_636")];
+            tensor<int32, [2]> var_698 = const()[name = tensor<string, []>("op_698"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_699 = reshape(shape = var_698, x = sqrt_s_t_5)[name = tensor<string, []>("op_699")];
+            tensor<fp32, [5, 5]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_699)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 5, 5]> var_701 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_701")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_623)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_636, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_638_transpose_x_0 = const()[name = tensor<string, []>("op_638_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_638_transpose_y_0 = const()[name = tensor<string, []>("op_638_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_638 = matmul(transpose_x = var_638_transpose_x_0, transpose_y = var_638_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_638")];
-            tensor<fp32, [5]> var_639 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_639")];
-            tensor<int32, [4]> var_640 = const()[name = tensor<string, []>("op_640"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_641 = reshape(shape = var_640, x = var_639)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_638, y = var_641)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_688)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_701, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_703_transpose_x_0 = const()[name = tensor<string, []>("op_703_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_703_transpose_y_0 = const()[name = tensor<string, []>("op_703_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_703 = matmul(transpose_x = var_703_transpose_x_0, transpose_y = var_703_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_703")];
+            tensor<fp32, [5]> var_704 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_704")];
+            tensor<int32, [4]> var_705 = const()[name = tensor<string, []>("op_705"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_706 = reshape(shape = var_705, x = var_704)[name = tensor<string, []>("op_706")];
+            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_703, y = var_706)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 5, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_644 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_644")];
-            tensor<bool, []> var_646_transpose_x_1 = const()[name = tensor<string, []>("op_646_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_646_transpose_y_1 = const()[name = tensor<string, []>("op_646_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_646 = matmul(transpose_x = var_646_transpose_x_1, transpose_y = var_646_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_646")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_644, y = var_646)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_648 = const()[name = tensor<string, []>("op_648"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_648)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_650 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_650")];
-            tensor<fp32, [1, 4, 64, 64]> var_651 = real_div(x = new_kv_unnorm_5, y = var_650)[name = tensor<string, []>("op_651")];
-            tensor<int32, [4]> var_652_perm_0 = const()[name = tensor<string, []>("op_652_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_709 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_709")];
+            tensor<bool, []> var_711_transpose_x_1 = const()[name = tensor<string, []>("op_711_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_711_transpose_y_1 = const()[name = tensor<string, []>("op_711_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_711 = matmul(transpose_x = var_711_transpose_x_1, transpose_y = var_711_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_711")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_709, y = var_711)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_713 = const()[name = tensor<string, []>("op_713"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_713)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_715 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_715")];
+            tensor<fp32, [1, 4, 64, 64]> var_716 = real_div(x = new_kv_unnorm_5, y = var_715)[name = tensor<string, []>("op_716")];
+            tensor<int32, [4]> var_717_perm_0 = const()[name = tensor<string, []>("op_717_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_652 = transpose(perm = var_652_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_18, x = var_652)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_656 = const()[name = tensor<string, []>("op_656"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_656, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 5, 256]> var_658 = silu(x = input_97)[name = tensor<string, []>("op_658")];
-            tensor<fp32, [1, 5, 256]> input_99 = mul(x = var_658, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 5, 4, 64]> var_717 = transpose(perm = var_717_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_82, x = var_717)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_721 = const()[name = tensor<string, []>("op_721"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_721, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 5, 256]> var_723 = silu(x = input_99)[name = tensor<string, []>("op_723")];
+            tensor<fp32, [1, 5, 256]> input_101 = mul(x = var_723, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_666_begin_0 = const()[name = tensor<string, []>("op_666_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_666_end_0 = const()[name = tensor<string, []>("op_666_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_666_end_mask_0 = const()[name = tensor<string, []>("op_666_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_666 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = x_15)[name = tensor<string, []>("op_666")];
-            tensor<int32, [3]> var_669_begin_0 = const()[name = tensor<string, []>("op_669_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_669_end_0 = const()[name = tensor<string, []>("op_669_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_669_end_mask_0 = const()[name = tensor<string, []>("op_669_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_669 = slice_by_index(begin = var_669_begin_0, end = var_669_end_0, end_mask = var_669_end_mask_0, x = window_25)[name = tensor<string, []>("op_669")];
+            tensor<int32, [3]> var_731_begin_0 = const()[name = tensor<string, []>("op_731_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_731_end_0 = const()[name = tensor<string, []>("op_731_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_731_end_mask_0 = const()[name = tensor<string, []>("op_731_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_731 = slice_by_index(begin = var_731_begin_0, end = var_731_end_0, end_mask = var_731_end_mask_0, x = x_15)[name = tensor<string, []>("op_731")];
+            tensor<int32, [3]> var_734_begin_0 = const()[name = tensor<string, []>("op_734_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_734_end_0 = const()[name = tensor<string, []>("op_734_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_734_end_mask_0 = const()[name = tensor<string, []>("op_734_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_734 = slice_by_index(begin = var_734_begin_0, end = var_734_end_0, end_mask = var_734_end_mask_0, x = window_25)[name = tensor<string, []>("op_734")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_27, interleave = window_27_interleave_0, values = (var_669, var_666))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_674_begin_0 = const()[name = tensor<string, []>("op_674_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_674_end_0 = const()[name = tensor<string, []>("op_674_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_674_end_mask_0 = const()[name = tensor<string, []>("op_674_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_674 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = x_15)[name = tensor<string, []>("op_674")];
-            tensor<int32, [3]> var_677_begin_0 = const()[name = tensor<string, []>("op_677_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_677_end_0 = const()[name = tensor<string, []>("op_677_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_677_end_mask_0 = const()[name = tensor<string, []>("op_677_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_677 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = window_27)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_92, interleave = window_27_interleave_0, values = (var_734, var_731))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_739_begin_0 = const()[name = tensor<string, []>("op_739_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_739_end_0 = const()[name = tensor<string, []>("op_739_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_739_end_mask_0 = const()[name = tensor<string, []>("op_739_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_739 = slice_by_index(begin = var_739_begin_0, end = var_739_end_0, end_mask = var_739_end_mask_0, x = x_15)[name = tensor<string, []>("op_739")];
+            tensor<int32, [3]> var_742_begin_0 = const()[name = tensor<string, []>("op_742_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_742_end_0 = const()[name = tensor<string, []>("op_742_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_742_end_mask_0 = const()[name = tensor<string, []>("op_742_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_742 = slice_by_index(begin = var_742_begin_0, end = var_742_end_0, end_mask = var_742_end_mask_0, x = window_27)[name = tensor<string, []>("op_742")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_27, interleave = window_29_interleave_0, values = (var_677, var_674))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = x_15)[name = tensor<string, []>("op_682")];
-            tensor<int32, [3]> var_685_begin_0 = const()[name = tensor<string, []>("op_685_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_685_end_0 = const()[name = tensor<string, []>("op_685_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_685_end_mask_0 = const()[name = tensor<string, []>("op_685_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_685 = slice_by_index(begin = var_685_begin_0, end = var_685_end_0, end_mask = var_685_end_mask_0, x = window_29)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_92, interleave = window_29_interleave_0, values = (var_742, var_739))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_747_begin_0 = const()[name = tensor<string, []>("op_747_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_747_end_0 = const()[name = tensor<string, []>("op_747_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_747_end_mask_0 = const()[name = tensor<string, []>("op_747_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_747 = slice_by_index(begin = var_747_begin_0, end = var_747_end_0, end_mask = var_747_end_mask_0, x = x_15)[name = tensor<string, []>("op_747")];
+            tensor<int32, [3]> var_750_begin_0 = const()[name = tensor<string, []>("op_750_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_750_end_0 = const()[name = tensor<string, []>("op_750_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_750_end_mask_0 = const()[name = tensor<string, []>("op_750_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_750 = slice_by_index(begin = var_750_begin_0, end = var_750_end_0, end_mask = var_750_end_mask_0, x = window_29)[name = tensor<string, []>("op_750")];
             tensor<bool, []> window_31_interleave_0 = const()[name = tensor<string, []>("window_31_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_27, interleave = window_31_interleave_0, values = (var_685, var_682))[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = x_15)[name = tensor<string, []>("op_690")];
-            tensor<int32, [3]> var_693_begin_0 = const()[name = tensor<string, []>("op_693_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_693_end_0 = const()[name = tensor<string, []>("op_693_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_693_end_mask_0 = const()[name = tensor<string, []>("op_693_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_693 = slice_by_index(begin = var_693_begin_0, end = var_693_end_0, end_mask = var_693_end_mask_0, x = window_31)[name = tensor<string, []>("op_693")];
+            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_92, interleave = window_31_interleave_0, values = (var_750, var_747))[name = tensor<string, []>("window_31")];
+            tensor<int32, [3]> var_755_begin_0 = const()[name = tensor<string, []>("op_755_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_755_end_0 = const()[name = tensor<string, []>("op_755_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_755_end_mask_0 = const()[name = tensor<string, []>("op_755_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_755 = slice_by_index(begin = var_755_begin_0, end = var_755_end_0, end_mask = var_755_end_mask_0, x = x_15)[name = tensor<string, []>("op_755")];
+            tensor<int32, [3]> var_758_begin_0 = const()[name = tensor<string, []>("op_758_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_758_end_0 = const()[name = tensor<string, []>("op_758_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_758_end_mask_0 = const()[name = tensor<string, []>("op_758_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_758 = slice_by_index(begin = var_758_begin_0, end = var_758_end_0, end_mask = var_758_end_mask_0, x = window_31)[name = tensor<string, []>("op_758")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_27, interleave = window_33_interleave_0, values = (var_693, var_690))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = x_15)[name = tensor<string, []>("op_698")];
-            tensor<int32, [3]> var_701_begin_0 = const()[name = tensor<string, []>("op_701_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_701_end_0 = const()[name = tensor<string, []>("op_701_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_701_end_mask_0 = const()[name = tensor<string, []>("op_701_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_701 = slice_by_index(begin = var_701_begin_0, end = var_701_end_0, end_mask = var_701_end_mask_0, x = window_33)[name = tensor<string, []>("op_701")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_92, interleave = window_33_interleave_0, values = (var_758, var_755))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_763_begin_0 = const()[name = tensor<string, []>("op_763_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_763_end_0 = const()[name = tensor<string, []>("op_763_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_763_end_mask_0 = const()[name = tensor<string, []>("op_763_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_763 = slice_by_index(begin = var_763_begin_0, end = var_763_end_0, end_mask = var_763_end_mask_0, x = x_15)[name = tensor<string, []>("op_763")];
+            tensor<int32, [3]> var_766_begin_0 = const()[name = tensor<string, []>("op_766_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_766_end_0 = const()[name = tensor<string, []>("op_766_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_766_end_mask_0 = const()[name = tensor<string, []>("op_766_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_766 = slice_by_index(begin = var_766_begin_0, end = var_766_end_0, end_mask = var_766_end_mask_0, x = window_33)[name = tensor<string, []>("op_766")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_27, interleave = window_35_interleave_0, values = (var_701, var_698))[name = tensor<string, []>("window_35")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_101 = concat(axis = var_24, interleave = input_101_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_92, interleave = window_35_interleave_0, values = (var_766, var_763))[name = tensor<string, []>("window_35")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_103 = concat(axis = var_77, interleave = input_103_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_726_split_sizes_0 = const()[name = tensor<string, []>("op_726_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_726_axis_0 = const()[name = tensor<string, []>("op_726_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_726_0, tensor<fp32, [5, 256, 16]> var_726_1 = split(axis = var_726_axis_0, split_sizes = var_726_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_726")];
-            tensor<fp32, [5, 256, 16]> var_728 = sigmoid(x = var_726_1)[name = tensor<string, []>("op_728")];
-            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_726_0, y = var_728)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [5, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_791_split_sizes_0 = const()[name = tensor<string, []>("op_791_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_791_axis_0 = const()[name = tensor<string, []>("op_791_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_791_0, tensor<fp32, [5, 256, 16]> var_791_1 = split(axis = var_791_axis_0, split_sizes = var_791_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_791")];
+            tensor<fp32, [5, 256, 16]> var_793 = sigmoid(x = var_791_1)[name = tensor<string, []>("op_793")];
+            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_791_0, y = var_793)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [5, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_74, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [5, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [5, 1, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_759")];
-            tensor<int32, [3]> var_761_perm_0 = const()[name = tensor<string, []>("op_761_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_761 = transpose(perm = var_761_perm_0, x = var_759)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 5, 256]> input_111 = add(x = x_15, y = var_761)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 5, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 5, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_785 = mul(x = input_119, y = var_784)[name = tensor<string, []>("op_785")];
-            tensor<fp32, [1, 5, 256]> input_121 = add(x = var_785, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_29, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_824_begin_0 = const()[name = tensor<string, []>("op_824_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_824_end_0 = const()[name = tensor<string, []>("op_824_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_824_end_mask_0 = const()[name = tensor<string, []>("op_824_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [5, 1, 256]> var_824 = slice_by_index(begin = var_824_begin_0, end = var_824_end_0, end_mask = var_824_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_824")];
+            tensor<int32, [3]> var_826_perm_0 = const()[name = tensor<string, []>("op_826_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_826 = transpose(perm = var_826_perm_0, x = var_824)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 5, 256]> input_113 = add(x = x_15, y = var_826)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 5, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 5, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_850 = mul(x = input_121, y = var_849)[name = tensor<string, []>("op_850")];
+            tensor<fp32, [1, 5, 256]> input_123 = add(x = var_850, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 5, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 5, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_814 = const()[name = tensor<string, []>("op_814"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_815 = mul(x = input_131, y = var_814)[name = tensor<string, []>("op_815")];
-            tensor<fp32, [1, 5, 256]> input_133 = add(x = var_815, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_74, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 5, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 5, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_879 = const()[name = tensor<string, []>("op_879"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_880 = mul(x = input_133, y = var_879)[name = tensor<string, []>("op_880")];
+            tensor<fp32, [1, 5, 256]> input_135 = add(x = var_880, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_29, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_74, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -800,219 +826,212 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 5, 256]> var_829 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_831 = reshape(shape = var_830, x = var_829)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 5, 256]> var_894 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_895 = const()[name = tensor<string, []>("op_895"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_896 = reshape(shape = var_895, x = var_894)[name = tensor<string, []>("op_896")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_835 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_836 = const()[name = tensor<string, []>("op_836"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_837 = mul(x = var_835, y = var_836)[name = tensor<string, []>("op_837")];
-            tensor<int32, [4]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_839 = reshape(shape = var_838, x = var_837)[name = tensor<string, []>("op_839")];
+            tensor<fp32, [1, 5, 256]> var_900 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_901 = const()[name = tensor<string, []>("op_901"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_902 = mul(x = var_900, y = var_901)[name = tensor<string, []>("op_902")];
+            tensor<int32, [4]> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_904 = reshape(shape = var_903, x = var_902)[name = tensor<string, []>("op_904")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_843 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_845 = reshape(shape = var_844, x = var_843)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 5, 256]> var_908 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_910 = reshape(shape = var_909, x = var_908)[name = tensor<string, []>("op_910")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 5, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [5]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_839)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_831)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_904)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_896)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 5, 5]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_855 = const()[name = tensor<string, []>("op_855"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_856 = reshape(shape = var_855, x = sqrt_s_t_7)[name = tensor<string, []>("op_856")];
-            tensor<fp32, [5, 5]> M_7 = real_div(x = encoder__causal_mask, y = var_856)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 5, 5]> var_858 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [2]> var_920 = const()[name = tensor<string, []>("op_920"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_921 = reshape(shape = var_920, x = sqrt_s_t_7)[name = tensor<string, []>("op_921")];
+            tensor<fp32, [5, 5]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_921)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 5, 5]> var_923 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_923")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_845)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_858, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_860_transpose_x_0 = const()[name = tensor<string, []>("op_860_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_860_transpose_y_0 = const()[name = tensor<string, []>("op_860_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_860 = matmul(transpose_x = var_860_transpose_x_0, transpose_y = var_860_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_860")];
-            tensor<fp32, [5]> var_861 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_861")];
-            tensor<int32, [4]> var_862 = const()[name = tensor<string, []>("op_862"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_863 = reshape(shape = var_862, x = var_861)[name = tensor<string, []>("op_863")];
-            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_860, y = var_863)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_910)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_923, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_925_transpose_x_0 = const()[name = tensor<string, []>("op_925_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_925_transpose_y_0 = const()[name = tensor<string, []>("op_925_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_925 = matmul(transpose_x = var_925_transpose_x_0, transpose_y = var_925_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_925")];
+            tensor<fp32, [5]> var_926 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_926")];
+            tensor<int32, [4]> var_927 = const()[name = tensor<string, []>("op_927"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_928 = reshape(shape = var_927, x = var_926)[name = tensor<string, []>("op_928")];
+            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_925, y = var_928)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 5, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_866 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_866")];
-            tensor<bool, []> var_868_transpose_x_1 = const()[name = tensor<string, []>("op_868_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_868_transpose_y_1 = const()[name = tensor<string, []>("op_868_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_868 = matmul(transpose_x = var_868_transpose_x_1, transpose_y = var_868_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_868")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_866, y = var_868)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_870 = const()[name = tensor<string, []>("op_870"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_870)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_872 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_872")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_872)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_874_perm_0 = const()[name = tensor<string, []>("op_874_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_931 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_931")];
+            tensor<bool, []> var_933_transpose_x_1 = const()[name = tensor<string, []>("op_933_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_933_transpose_y_1 = const()[name = tensor<string, []>("op_933_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_933 = matmul(transpose_x = var_933_transpose_x_1, transpose_y = var_933_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_933")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_931, y = var_933)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_935 = const()[name = tensor<string, []>("op_935"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_935)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_937 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_937")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_937)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_939_perm_0 = const()[name = tensor<string, []>("op_939_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_874 = transpose(perm = var_874_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_18, x = var_874)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_878 = const()[name = tensor<string, []>("op_878"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_878, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 5, 256]> var_880 = silu(x = input_137)[name = tensor<string, []>("op_880")];
-            tensor<fp32, [1, 5, 256]> input_139 = mul(x = var_880, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 5, 4, 64]> var_939 = transpose(perm = var_939_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_82, x = var_939)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_943 = const()[name = tensor<string, []>("op_943"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_943, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 5, 256]> var_945 = silu(x = input_139)[name = tensor<string, []>("op_945")];
+            tensor<fp32, [1, 5, 256]> input_141 = mul(x = var_945, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_37_begin_0 = const()[name = tensor<string, []>("window_37_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_37_end_0 = const()[name = tensor<string, []>("window_37_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_37_end_mask_0 = const()[name = tensor<string, []>("window_37_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_37_squeeze_mask_0 = const()[name = tensor<string, []>("window_37_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_37 = slice_by_index(begin = window_37_begin_0, end = window_37_end_0, end_mask = window_37_end_mask_0, squeeze_mask = window_37_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = x_21)[name = tensor<string, []>("op_888")];
-            tensor<int32, [3]> var_891_begin_0 = const()[name = tensor<string, []>("op_891_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_891_end_0 = const()[name = tensor<string, []>("op_891_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_891_end_mask_0 = const()[name = tensor<string, []>("op_891_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_891 = slice_by_index(begin = var_891_begin_0, end = var_891_end_0, end_mask = var_891_end_mask_0, x = window_37)[name = tensor<string, []>("op_891")];
+            tensor<int32, [3]> var_953_begin_0 = const()[name = tensor<string, []>("op_953_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_953_end_0 = const()[name = tensor<string, []>("op_953_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_953_end_mask_0 = const()[name = tensor<string, []>("op_953_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_953 = slice_by_index(begin = var_953_begin_0, end = var_953_end_0, end_mask = var_953_end_mask_0, x = x_21)[name = tensor<string, []>("op_953")];
+            tensor<int32, [3]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_956 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = window_37)[name = tensor<string, []>("op_956")];
             tensor<bool, []> window_39_interleave_0 = const()[name = tensor<string, []>("window_39_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_27, interleave = window_39_interleave_0, values = (var_891, var_888))[name = tensor<string, []>("window_39")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = x_21)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> var_899_begin_0 = const()[name = tensor<string, []>("op_899_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_899_end_0 = const()[name = tensor<string, []>("op_899_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_899_end_mask_0 = const()[name = tensor<string, []>("op_899_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_899 = slice_by_index(begin = var_899_begin_0, end = var_899_end_0, end_mask = var_899_end_mask_0, x = window_39)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_92, interleave = window_39_interleave_0, values = (var_956, var_953))[name = tensor<string, []>("window_39")];
+            tensor<int32, [3]> var_961_begin_0 = const()[name = tensor<string, []>("op_961_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_961_end_0 = const()[name = tensor<string, []>("op_961_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_961_end_mask_0 = const()[name = tensor<string, []>("op_961_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_961 = slice_by_index(begin = var_961_begin_0, end = var_961_end_0, end_mask = var_961_end_mask_0, x = x_21)[name = tensor<string, []>("op_961")];
+            tensor<int32, [3]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_964 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = window_39)[name = tensor<string, []>("op_964")];
             tensor<bool, []> window_41_interleave_0 = const()[name = tensor<string, []>("window_41_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_27, interleave = window_41_interleave_0, values = (var_899, var_896))[name = tensor<string, []>("window_41")];
-            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = x_21)[name = tensor<string, []>("op_904")];
-            tensor<int32, [3]> var_907_begin_0 = const()[name = tensor<string, []>("op_907_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_907_end_0 = const()[name = tensor<string, []>("op_907_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_907_end_mask_0 = const()[name = tensor<string, []>("op_907_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_907 = slice_by_index(begin = var_907_begin_0, end = var_907_end_0, end_mask = var_907_end_mask_0, x = window_41)[name = tensor<string, []>("op_907")];
+            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_92, interleave = window_41_interleave_0, values = (var_964, var_961))[name = tensor<string, []>("window_41")];
+            tensor<int32, [3]> var_969_begin_0 = const()[name = tensor<string, []>("op_969_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_969_end_0 = const()[name = tensor<string, []>("op_969_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_969_end_mask_0 = const()[name = tensor<string, []>("op_969_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_969 = slice_by_index(begin = var_969_begin_0, end = var_969_end_0, end_mask = var_969_end_mask_0, x = x_21)[name = tensor<string, []>("op_969")];
+            tensor<int32, [3]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_972 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = window_41)[name = tensor<string, []>("op_972")];
             tensor<bool, []> window_43_interleave_0 = const()[name = tensor<string, []>("window_43_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_27, interleave = window_43_interleave_0, values = (var_907, var_904))[name = tensor<string, []>("window_43")];
-            tensor<int32, [3]> var_912_begin_0 = const()[name = tensor<string, []>("op_912_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_912_end_0 = const()[name = tensor<string, []>("op_912_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_912_end_mask_0 = const()[name = tensor<string, []>("op_912_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_912 = slice_by_index(begin = var_912_begin_0, end = var_912_end_0, end_mask = var_912_end_mask_0, x = x_21)[name = tensor<string, []>("op_912")];
-            tensor<int32, [3]> var_915_begin_0 = const()[name = tensor<string, []>("op_915_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_915_end_0 = const()[name = tensor<string, []>("op_915_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_915_end_mask_0 = const()[name = tensor<string, []>("op_915_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_915 = slice_by_index(begin = var_915_begin_0, end = var_915_end_0, end_mask = var_915_end_mask_0, x = window_43)[name = tensor<string, []>("op_915")];
+            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_92, interleave = window_43_interleave_0, values = (var_972, var_969))[name = tensor<string, []>("window_43")];
+            tensor<int32, [3]> var_977_begin_0 = const()[name = tensor<string, []>("op_977_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_977_end_0 = const()[name = tensor<string, []>("op_977_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_977_end_mask_0 = const()[name = tensor<string, []>("op_977_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_977 = slice_by_index(begin = var_977_begin_0, end = var_977_end_0, end_mask = var_977_end_mask_0, x = x_21)[name = tensor<string, []>("op_977")];
+            tensor<int32, [3]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_980 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = window_43)[name = tensor<string, []>("op_980")];
             tensor<bool, []> window_45_interleave_0 = const()[name = tensor<string, []>("window_45_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_27, interleave = window_45_interleave_0, values = (var_915, var_912))[name = tensor<string, []>("window_45")];
-            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = x_21)[name = tensor<string, []>("op_920")];
-            tensor<int32, [3]> var_923_begin_0 = const()[name = tensor<string, []>("op_923_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_923_end_0 = const()[name = tensor<string, []>("op_923_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_923_end_mask_0 = const()[name = tensor<string, []>("op_923_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_923 = slice_by_index(begin = var_923_begin_0, end = var_923_end_0, end_mask = var_923_end_mask_0, x = window_45)[name = tensor<string, []>("op_923")];
+            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_92, interleave = window_45_interleave_0, values = (var_980, var_977))[name = tensor<string, []>("window_45")];
+            tensor<int32, [3]> var_985_begin_0 = const()[name = tensor<string, []>("op_985_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_985_end_0 = const()[name = tensor<string, []>("op_985_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_985_end_mask_0 = const()[name = tensor<string, []>("op_985_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_985 = slice_by_index(begin = var_985_begin_0, end = var_985_end_0, end_mask = var_985_end_mask_0, x = x_21)[name = tensor<string, []>("op_985")];
+            tensor<int32, [3]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_988 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = window_45)[name = tensor<string, []>("op_988")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_27, interleave = window_interleave_0, values = (var_923, var_920))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_141 = concat(axis = var_24, interleave = input_141_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_92, interleave = window_interleave_0, values = (var_988, var_985))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_143 = concat(axis = var_77, interleave = input_143_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_948_split_sizes_0 = const()[name = tensor<string, []>("op_948_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_948_axis_0 = const()[name = tensor<string, []>("op_948_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_948_0, tensor<fp32, [5, 256, 16]> var_948_1 = split(axis = var_948_axis_0, split_sizes = var_948_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_948")];
-            tensor<fp32, [5, 256, 16]> var_950 = sigmoid(x = var_948_1)[name = tensor<string, []>("op_950")];
-            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_948_0, y = var_950)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [5, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_1013_split_sizes_0 = const()[name = tensor<string, []>("op_1013_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_1013_axis_0 = const()[name = tensor<string, []>("op_1013_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_1013_0, tensor<fp32, [5, 256, 16]> var_1013_1 = split(axis = var_1013_axis_0, split_sizes = var_1013_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_1013")];
+            tensor<fp32, [5, 256, 16]> var_1015 = sigmoid(x = var_1013_1)[name = tensor<string, []>("op_1015")];
+            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_1013_0, y = var_1015)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [5, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_74, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [5, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [5, 1, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_981")];
-            tensor<int32, [3]> var_983_perm_0 = const()[name = tensor<string, []>("op_983_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_983 = transpose(perm = var_983_perm_0, x = var_981)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 5, 256]> input_151 = add(x = x_21, y = var_983)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 5, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 5, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_1006 = const()[name = tensor<string, []>("op_1006"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_1007 = mul(x = input_159, y = var_1006)[name = tensor<string, []>("op_1007")];
-            tensor<fp32, [1, 5, 256]> input_161 = add(x = var_1007, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1046_begin_0 = const()[name = tensor<string, []>("op_1046_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1046_end_0 = const()[name = tensor<string, []>("op_1046_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_1046_end_mask_0 = const()[name = tensor<string, []>("op_1046_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [5, 1, 256]> var_1046 = slice_by_index(begin = var_1046_begin_0, end = var_1046_end_0, end_mask = var_1046_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1046")];
+            tensor<int32, [3]> var_1048_perm_0 = const()[name = tensor<string, []>("op_1048_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_1048 = transpose(perm = var_1048_perm_0, x = var_1046)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 5, 256]> input_153 = add(x = x_21, y = var_1048)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_74, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 5, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 5, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1071 = const()[name = tensor<string, []>("op_1071"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_1072 = mul(x = input_161, y = var_1071)[name = tensor<string, []>("op_1072")];
+            tensor<fp32, [1, 5, 256]> input_163 = add(x = var_1072, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_29, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_74, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 5]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_21, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_79, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_1025_begin_0 = const()[name = tensor<string, []>("op_1025_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
-            tensor<int32, [3]> var_1025_end_0 = const()[name = tensor<string, []>("op_1025_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
-            tensor<bool, [3]> var_1025_end_mask_0 = const()[name = tensor<string, []>("op_1025_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = cat)[name = tensor<string, []>("op_1025")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 5, 1]> var_1028 = reduce_l2_norm(axes = var_1027, keep_dims = var_30, x = input_163)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
+            tensor<int32, [3]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
+            tensor<bool, [3]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = cat)[name = tensor<string, []>("op_1090")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1092 = const()[name = tensor<string, []>("op_1092"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 5, 1]> var_1093 = reduce_l2_norm(axes = var_1092, keep_dims = var_73, x = input_165)[name = tensor<string, []>("op_1093")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_1028)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_1032_axis_0 = const()[name = tensor<string, []>("op_1032_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1032_axis_0, values = (var_207, var_429, var_651, nkv_1))[name = tensor<string, []>("op_1032")];
-            tensor<int32, []> var_1034_axis_0 = const()[name = tensor<string, []>("op_1034_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1034_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1034")];
-            tensor<int32, []> var_1036_axis_0 = const()[name = tensor<string, []>("op_1036_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1036_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1036")];
-            tensor<fp32, []> var_1045 = const()[name = tensor<string, []>("op_1045"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1053 = const()[name = tensor<string, []>("op_1053"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_88, beta = const_12, x = var_1093)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1097_axis_0 = const()[name = tensor<string, []>("op_1097_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1097_axis_0, values = (var_272, var_494, var_716, nkv_1))[name = tensor<string, []>("op_1097")];
+            tensor<int32, []> var_1099_axis_0 = const()[name = tensor<string, []>("op_1099_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1099_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1099")];
+            tensor<int32, []> var_1101_axis_0 = const()[name = tensor<string, []>("op_1101_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1101_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1101")];
             tensor<fp32, [1, 5, 6, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 5, 6, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395712)))];
-            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 5, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
+            tensor<int32, [1]> var_1169_axes_0 = const()[name = tensor<string, []>("op_1169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 5, 1, 256]> var_1169 = expand_dims(axes = var_1169_axes_0, x = emb)[name = tensor<string, []>("op_1169")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 6, 1])];
-            tensor<fp32, [1, 5, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 5, 6, 512]> input_165 = concat(axis = var_1059, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 5, 6, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [1, 6, 5, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [6, 5, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 5, 6, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1169)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 5, 6, 512]> input_167 = concat(axis = var_80, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 5, 6, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1177_perm_0 = const()[name = tensor<string, []>("op_1177_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1181 = const()[name = tensor<string, []>("op_1181"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [1, 6, 5, 256]> var_1177 = transpose(perm = var_1177_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [6, 5, 256]> x_29 = reshape(shape = var_1181, x = var_1177)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1023,132 +1042,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [6, 5, 256]> var_1147 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
+            tensor<fp32, [6, 5, 256]> var_1189 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1190 = const()[name = tensor<string, []>("op_1190"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1191 = reshape(shape = var_1190, x = var_1189)[name = tensor<string, []>("op_1191")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> var_1153 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 5, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
+            tensor<fp32, [6, 5, 256]> var_1195 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1196 = const()[name = tensor<string, []>("op_1196"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 5, 256]> var_1197 = mul(x = var_1195, y = var_1196)[name = tensor<string, []>("op_1197")];
+            tensor<int32, [4]> var_1198 = const()[name = tensor<string, []>("op_1198"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1199 = reshape(shape = var_1198, x = var_1197)[name = tensor<string, []>("op_1199")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> var_1161 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
+            tensor<fp32, [6, 5, 256]> var_1203 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1204 = const()[name = tensor<string, []>("op_1204"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1205 = reshape(shape = var_1204, x = var_1203)[name = tensor<string, []>("op_1205")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [6, 5, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_1065, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_77, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [5]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_1 = clip(alpha = var_1055, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [5]> clip_1 = clip(alpha = var_68, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [5]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [6, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [6, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1199)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [6, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1191)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [6, 4, 5, 5]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 5])];
-            tensor<fp32, [1, 5]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
-            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [6, 4, 5, 5]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
+            tensor<int32, [2]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [2]>([1, 5])];
+            tensor<fp32, [1, 5]> var_1218 = reshape(shape = var_1217, x = valid_mask)[name = tensor<string, []>("op_1218")];
+            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1218)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1220 = const()[name = tensor<string, []>("op_1220"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1221 = reshape(shape = var_1220, x = sqrt_s_t_9)[name = tensor<string, []>("op_1221")];
+            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1221)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [6, 4, 5, 5]> var_1223 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1223")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [6, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 5, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
-            tensor<fp32, [5]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
-            tensor<fp32, [6, 4, 5, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [6, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1205)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [6, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1223, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1225_transpose_x_0 = const()[name = tensor<string, []>("op_1225_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1225_transpose_y_0 = const()[name = tensor<string, []>("op_1225_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 5, 64]> var_1225 = matmul(transpose_x = var_1225_transpose_x_0, transpose_y = var_1225_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1225")];
+            tensor<fp32, [5]> var_1226 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1226")];
+            tensor<int32, [4]> var_1227 = const()[name = tensor<string, []>("op_1227"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1228 = reshape(shape = var_1227, x = var_1226)[name = tensor<string, []>("op_1228")];
+            tensor<fp32, [6, 4, 5, 64]> cross_9 = mul(x = var_1225, y = var_1228)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [6, 4, 5, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
-            tensor<fp32, [6, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
-            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
-            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1231 = const()[name = tensor<string, []>("op_1231"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1232 = reshape(shape = var_1231, x = valid_mask)[name = tensor<string, []>("op_1232")];
+            tensor<fp32, [6, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1232)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [6, 4, 64, 64]> var_1234 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1234")];
+            tensor<bool, []> var_1236_transpose_x_1 = const()[name = tensor<string, []>("op_1236_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1236_transpose_y_1 = const()[name = tensor<string, []>("op_1236_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1236 = matmul(transpose_x = var_1236_transpose_x_1, transpose_y = var_1236_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1236")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1234, y = var_1236)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1238_keep_dims_0 = const()[name = tensor<string, []>("op_1238_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1238 = reduce_sum(keep_dims = var_1238_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1238")];
+            tensor<int32, [1]> var_1239 = const()[name = tensor<string, []>("op_1239"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1240 = reshape(shape = var_1239, x = var_1238)[name = tensor<string, []>("op_1240")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1240)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1055, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_68, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [6, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
-            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [6, 4, 64, 64]> var_1244 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1244")];
+            tensor<int32, [4]> var_1245_perm_0 = const()[name = tensor<string, []>("op_1245_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 5, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [6, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1052, x = var_1203)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [6, 5, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [6, 5, 256]> var_1209 = silu(x = input_169)[name = tensor<string, []>("op_1209")];
-            tensor<fp32, [6, 5, 256]> input_171 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [6, 5, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [6, 5, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 5, 4, 64]> var_1245 = transpose(perm = var_1245_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [6, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_82, x = var_1245)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [6, 5, 256]> out_29 = reshape(shape = var_1249, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [6, 5, 256]> var_1251 = silu(x = input_171)[name = tensor<string, []>("op_1251")];
+            tensor<fp32, [6, 5, 256]> input_173 = mul(x = var_1251, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [6, 5, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [6, 5, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1050, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 6, 5, 256])];
-            tensor<fp32, [1, 6, 5, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
-            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([5, 6, 256])];
-            tensor<fp32, [1, 5, 6, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [5, 6, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [6, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_74, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1261 = const()[name = tensor<string, []>("op_1261"), val = tensor<int32, [4]>([1, 6, 5, 256])];
+            tensor<fp32, [1, 6, 5, 256]> var_1262 = reshape(shape = var_1261, x = xt_1)[name = tensor<string, []>("op_1262")];
+            tensor<int32, [4]> var_1263_perm_0 = const()[name = tensor<string, []>("op_1263_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1266 = const()[name = tensor<string, []>("op_1266"), val = tensor<int32, [3]>([5, 6, 256])];
+            tensor<fp32, [1, 5, 6, 256]> var_1263 = transpose(perm = var_1263_perm_0, x = var_1262)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [5, 6, 256]> query_1 = reshape(shape = var_1266, x = var_1263)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 5, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [6, 5, 768]> var_1247 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [6, 5, 768]> var_1289 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([6, 5, 3, 256])];
-            tensor<fp32, [6, 5, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
-            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 5, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
-            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 5, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 6, 5, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [6, 5, 3, 256]> var_1291 = reshape(shape = concat_1, x = var_1289)[name = tensor<string, []>("op_1291")];
+            tensor<int32, [1]> var_1292_axes_0 = const()[name = tensor<string, []>("op_1292_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 5, 3, 256]> var_1292 = expand_dims(axes = var_1292_axes_0, x = var_1291)[name = tensor<string, []>("op_1292")];
+            tensor<int32, [5]> var_1293_perm_0 = const()[name = tensor<string, []>("op_1293_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1294_axes_0 = const()[name = tensor<string, []>("op_1294_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 5, 1, 256]> var_1293 = transpose(perm = var_1293_perm_0, x = var_1292)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 6, 5, 256]> var_1294 = squeeze(axes = var_1294_axes_0, x = var_1293)[name = tensor<string, []>("op_1294")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 6, 5, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [6, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1294)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 6, 5, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [6, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1294)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 6, 5, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
+            tensor<fp32, [6, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1294)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1302 = const()[name = tensor<string, []>("op_1302"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1303 = reshape(shape = var_1302, x = q_11)[name = tensor<string, []>("op_1303")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
+            tensor<int32, [3]> var_1309 = const()[name = tensor<string, []>("op_1309"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1310 = reshape(shape = var_1309, x = k_11)[name = tensor<string, []>("op_1310")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [3]> var_1316 = const()[name = tensor<string, []>("op_1316"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1317 = reshape(shape = var_1316, x = v_11)[name = tensor<string, []>("op_1317")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [5, 4, 6, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [5, 4, 6, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [5, 4, 6, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1303)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [5, 4, 6, 64]> q_15 = reshape(shape = var_1320, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1322 = const()[name = tensor<string, []>("op_1322"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1310)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [5, 4, 6, 64]> k_15 = reshape(shape = var_1322, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1324 = const()[name = tensor<string, []>("op_1324"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1317)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [5, 4, 6, 64]> v_15 = reshape(shape = var_1324, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 6, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1159,30 +1178,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 6, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([30, 256])];
-            tensor<fp32, [6, 5, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [30, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [30, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [6, 5, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1327 = const()[name = tensor<string, []>("op_1327"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1332 = const()[name = tensor<string, []>("op_1332"), val = tensor<int32, [2]>([30, 256])];
+            tensor<fp32, [6, 5, 4, 64]> var_1328 = transpose(perm = var_1327, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [30, 256]> attn_output_3 = reshape(shape = var_1332, x = var_1328)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [30, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1336 = const()[name = tensor<string, []>("op_1336"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [6, 5, 256]> attn_output_7 = reshape(shape = var_1336, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 6, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [5, 6, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 6, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1050, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [5, 6, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [5, 6, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [5, 6, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [5, 6, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 6, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 6, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_74, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [5, 6, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [5, 6, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 6, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [5, 6, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1050, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 5, 6, 256])];
-            tensor<fp32, [1, 5, 6, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [1, 6, 5, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [6, 5, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
+            tensor<fp32, [5, 6, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_74, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1356 = const()[name = tensor<string, []>("op_1356"), val = tensor<int32, [4]>([1, 5, 6, 256])];
+            tensor<fp32, [1, 5, 6, 256]> x_31 = reshape(shape = var_1356, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1358_perm_0 = const()[name = tensor<string, []>("op_1358_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1362 = const()[name = tensor<string, []>("op_1362"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [1, 6, 5, 256]> var_1358 = transpose(perm = var_1358_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [6, 5, 256]> x = reshape(shape = var_1362, x = var_1358)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 6, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1193,120 +1212,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [6, 5, 256]> var_1328 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
+            tensor<fp32, [6, 5, 256]> var_1370 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1372 = reshape(shape = var_1371, x = var_1370)[name = tensor<string, []>("op_1372")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> var_1334 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [6, 5, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
+            tensor<fp32, [6, 5, 256]> var_1376 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1377 = const()[name = tensor<string, []>("op_1377"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [6, 5, 256]> var_1378 = mul(x = var_1376, y = var_1377)[name = tensor<string, []>("op_1378")];
+            tensor<int32, [4]> var_1379 = const()[name = tensor<string, []>("op_1379"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1380 = reshape(shape = var_1379, x = var_1378)[name = tensor<string, []>("op_1380")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> var_1342 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([6, 5, 4, 64])];
-            tensor<fp32, [6, 5, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
+            tensor<fp32, [6, 5, 256]> var_1384 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1385 = const()[name = tensor<string, []>("op_1385"), val = tensor<int32, [4]>([6, 5, 4, 64])];
+            tensor<fp32, [6, 5, 4, 64]> var_1386 = reshape(shape = var_1385, x = var_1384)[name = tensor<string, []>("op_1386")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [6, 5, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [6, 5, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [5]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_3 = clip(alpha = var_1055, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [5]> clip_3 = clip(alpha = var_68, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [5]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [6, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [6, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [6, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1380)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [6, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1372)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [6, 4, 5, 5]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
-            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
-            tensor<fp32, [6, 4, 5, 5]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [6, 4, 5, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 5, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
-            tensor<fp32, [5]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
-            tensor<fp32, [6, 4, 5, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
-            tensor<fp32, [6, 4, 5, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [6, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [6, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
-            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [6, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
-            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1401 = const()[name = tensor<string, []>("op_1401"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1402 = reshape(shape = var_1401, x = sqrt_s_t)[name = tensor<string, []>("op_1402")];
+            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1402)[name = tensor<string, []>("M")];
+            tensor<fp32, [6, 4, 5, 5]> var_1404 = mul(x = qk, y = M)[name = tensor<string, []>("op_1404")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1386)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [6, 4, 5, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1404, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1406_transpose_x_0 = const()[name = tensor<string, []>("op_1406_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1406_transpose_y_0 = const()[name = tensor<string, []>("op_1406_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 5, 64]> var_1406 = matmul(transpose_x = var_1406_transpose_x_0, transpose_y = var_1406_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1406")];
+            tensor<fp32, [5]> var_1407 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1407")];
+            tensor<int32, [4]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1409 = reshape(shape = var_1408, x = var_1407)[name = tensor<string, []>("op_1409")];
+            tensor<fp32, [6, 4, 5, 64]> cross = mul(x = var_1406, y = var_1409)[name = tensor<string, []>("cross")];
+            tensor<fp32, [6, 4, 5, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [6, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1232)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [6, 4, 64, 64]> var_1415 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1415")];
+            tensor<bool, []> var_1417_transpose_x_1 = const()[name = tensor<string, []>("op_1417_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1417_transpose_y_1 = const()[name = tensor<string, []>("op_1417_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [6, 4, 64, 64]> var_1417 = matmul(transpose_x = var_1417_transpose_x_1, transpose_y = var_1417_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1417")];
+            tensor<fp32, [6, 4, 64, 64]> new_kv_unnorm = add(x = var_1415, y = var_1417)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1240)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1055, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_68, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [6, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1426_perm_0 = const()[name = tensor<string, []>("op_1426_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 5, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [6, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1052, x = var_1384)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [6, 5, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [6, 5, 256]> var_1390 = silu(x = input_187)[name = tensor<string, []>("op_1390")];
-            tensor<fp32, [6, 5, 256]> input_189 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [6, 5, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [6, 5, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 5, 4, 64]> var_1426 = transpose(perm = var_1426_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [6, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_82, x = var_1426)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [6, 5, 256]> out = reshape(shape = var_1430, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [6, 5, 256]> var_1432 = silu(x = input_189)[name = tensor<string, []>("op_1432")];
+            tensor<fp32, [6, 5, 256]> input_191 = mul(x = var_1432, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [6, 5, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [6, 5, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [6, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1050, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 6, 5, 256])];
-            tensor<fp32, [1, 6, 5, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
-            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([5, 6, 256])];
-            tensor<fp32, [1, 5, 6, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [5, 6, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [6, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_74, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1442 = const()[name = tensor<string, []>("op_1442"), val = tensor<int32, [4]>([1, 6, 5, 256])];
+            tensor<fp32, [1, 6, 5, 256]> var_1443 = reshape(shape = var_1442, x = xt_5)[name = tensor<string, []>("op_1443")];
+            tensor<int32, [4]> var_1444_perm_0 = const()[name = tensor<string, []>("op_1444_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [3]>([5, 6, 256])];
+            tensor<fp32, [1, 5, 6, 256]> var_1444 = transpose(perm = var_1444_perm_0, x = var_1443)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [5, 6, 256]> query_5 = reshape(shape = var_1447, x = var_1444)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [6, 5, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [6, 5, 768]> var_1428 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [6, 5, 768]> var_1470 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([6, 5, 3, 256])];
-            tensor<fp32, [6, 5, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
-            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 6, 5, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
-            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 6, 5, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 6, 5, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [6, 5, 3, 256]> var_1472 = reshape(shape = concat_2, x = var_1470)[name = tensor<string, []>("op_1472")];
+            tensor<int32, [1]> var_1473_axes_0 = const()[name = tensor<string, []>("op_1473_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 6, 5, 3, 256]> var_1473 = expand_dims(axes = var_1473_axes_0, x = var_1472)[name = tensor<string, []>("op_1473")];
+            tensor<int32, [5]> var_1474_perm_0 = const()[name = tensor<string, []>("op_1474_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1475_axes_0 = const()[name = tensor<string, []>("op_1475_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 6, 5, 1, 256]> var_1474 = transpose(perm = var_1474_perm_0, x = var_1473)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 6, 5, 256]> var_1475 = squeeze(axes = var_1475_axes_0, x = var_1474)[name = tensor<string, []>("op_1475")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 6, 5, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [6, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1475)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 6, 5, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [6, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1475)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 6, 5, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [6, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
+            tensor<fp32, [6, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1475)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1483 = const()[name = tensor<string, []>("op_1483"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1484 = reshape(shape = var_1483, x = q_19)[name = tensor<string, []>("op_1484")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
+            tensor<int32, [3]> var_1490 = const()[name = tensor<string, []>("op_1490"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1491 = reshape(shape = var_1490, x = k_19)[name = tensor<string, []>("op_1491")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([6, 20, 64])];
-            tensor<fp32, [6, 20, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
+            tensor<int32, [3]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [3]>([6, 20, 64])];
+            tensor<fp32, [6, 20, 64]> var_1498 = reshape(shape = var_1497, x = v_19)[name = tensor<string, []>("op_1498")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [5, 4, 6, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [5, 4, 6, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([5, 4, 6, 64])];
-            tensor<fp32, [20, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [5, 4, 6, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1501 = const()[name = tensor<string, []>("op_1501"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1484)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [5, 4, 6, 64]> q = reshape(shape = var_1501, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1503 = const()[name = tensor<string, []>("op_1503"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1491)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [5, 4, 6, 64]> k = reshape(shape = var_1503, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1505 = const()[name = tensor<string, []>("op_1505"), val = tensor<int32, [4]>([5, 4, 6, 64])];
+            tensor<fp32, [20, 6, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1498)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [5, 4, 6, 64]> v = reshape(shape = var_1505, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 6, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1317,36 +1336,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 6, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([30, 256])];
-            tensor<fp32, [6, 5, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [30, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [30, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([6, 5, 256])];
-            tensor<fp32, [6, 5, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1508 = const()[name = tensor<string, []>("op_1508"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1513 = const()[name = tensor<string, []>("op_1513"), val = tensor<int32, [2]>([30, 256])];
+            tensor<fp32, [6, 5, 4, 64]> var_1509 = transpose(perm = var_1508, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [30, 256]> attn_output_11 = reshape(shape = var_1513, x = var_1509)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [30, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1517 = const()[name = tensor<string, []>("op_1517"), val = tensor<int32, [3]>([6, 5, 256])];
+            tensor<fp32, [6, 5, 256]> attn_output = reshape(shape = var_1517, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 6, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [5, 6, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 6, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1050, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [5, 6, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [5, 6, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [5, 6, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [5, 6, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 6, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 6, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_74, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [5, 6, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [5, 6, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 6, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [5, 6, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1050, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 5, 6, 256])];
-            tensor<fp32, [1, 5, 6, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 6, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_1053, x = input)[name = tensor<string, []>("op_1498")];
+            tensor<fp32, [5, 6, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_74, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1537 = const()[name = tensor<string, []>("op_1537"), val = tensor<int32, [4]>([1, 5, 6, 256])];
+            tensor<fp32, [1, 5, 6, 256]> input = reshape(shape = var_1537, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1539 = const()[name = tensor<string, []>("op_1539"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 6, 1]> var_1540 = reduce_l2_norm(axes = var_1539, keep_dims = var_73, x = input)[name = tensor<string, []>("op_1540")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 6, 1]> clip_5 = clip(alpha = var_1045, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 5, 6, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
+            tensor<fp32, [1, 5, 6, 1]> clip_5 = clip(alpha = var_88, beta = const_42, x = var_1540)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 5, 6, 256]> var_1542 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1542")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([5, 1, 256])];
             tensor<fp32, [5, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([5, 256, 6])];
-            tensor<fp32, [1, 5, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 5, 256, 6]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1542)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [5, 256, 6]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1357,10 +1376,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 5, 5])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 5, 4]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 5, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
-            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
-            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
+            tensor<fp32, [1, 5, 4]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1546")];
+            tensor<int32, []> var_1548_axis_0 = const()[name = tensor<string, []>("op_1548_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 6, 4, 64, 64]> dec_kv_new = stack(axis = var_1548_axis_0, values = (var_1244, nkv))[name = tensor<string, []>("op_1548")];
+            tensor<int32, []> var_1550_axis_0 = const()[name = tensor<string, []>("op_1550_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1550_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1550")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index c2635f870434e5d00bdf7febc221c1084a938f1b..b439a634d1a038a58cc98042708019768152db58 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb1fc846d5fe66b99cc3d94f2f27dc499fd4bcd04080e23e0297dd02f0f61f7b
-size 196611
+oid sha256:449581250eeb3f6eda599089dca398f8bd8fb4e4433e96a35c5ccfdf9a84e6c7
+size 203211
diff --git a/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Manifest.json b/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Manifest.json
index 719daafccc33381f00ca4380bee97b50ce8c7019..3aae5688c1b26b6cb8cdfd89ba0210bc978433ea 100644
--- a/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Manifest.json
+++ b/optimized/ami/500ms/ls_eend_ami_500ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "BFFA710C-2BC9-41A2-80C8-E692BA4158D2": {
+        "42F69C96-0A26-4074-BA5C-B0CA1B7F580E": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "F337566C-1A74-4DED-8712-B263AAD1B3CF": {
+        "516AD8CE-7461-455E-837D-56DC0B7EE116": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "F337566C-1A74-4DED-8712-B263AAD1B3CF"
+    "rootModelIdentifier": "516AD8CE-7461-455E-837D-56DC0B7EE116"
 }
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/analytics/coremldata.bin b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/analytics/coremldata.bin
index d14d2473e765f039603cf751a34e8895b6d8f152..31fee0a94bede8702423f9026567ab93f857b5a1 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:162c9b06e4452aa666026e5cf6d51ebd396faac77145f39864e8e1fee2ec569e
+oid sha256:75d3cc235f66ab174d6aa9cba0d97746fe4f69ba198a3ade13dc3cd450a16a2a
 size 243
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/coremldata.bin b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/coremldata.bin
index 4f3888dde2e2edab7506a40b902b161ff6a88031..8ac0ea92ab52934795d546e4b456121c116daaa2 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/coremldata.bin
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e2c9fc79e93736f5e7e4017f5282df7b689fac77791ab360645dd0732ca2af9
-size 1301
+oid sha256:e097920817284b8134fcc6a8906e568bdf28e5a55bd78818f9e0485c71019561
+size 1404
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/metadata.json b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/metadata.json
index c0c60029c9245e143c373a94558005a27729a798..7354bbdc1dd030e949e3666df3cfdc7562a14d38 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/metadata.json
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=1, max_speakers=7)",
+    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=1, max_speakers=7, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,12 +81,12 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 66,
+      "Ios17.reshape" : 67,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
       "Split" : 4,
-      "Ios17.expandDims" : 3,
+      "Ios17.expandDims" : 4,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
       "Ios17.sliceByIndex" : 36,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 1 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 15 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 1, 345]",
+        "shape" : "[1, 15, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 15}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/model.mil b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/model.mil
index c6564bd8478dcda29a396e2ab8dcb7eda1e4a19b..dfa2adbe0753d8a6dffeec554a5c9c51bd856833 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/model.mil
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlmodelc/model.mil
@@ -1,233 +1,239 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 1, 345]> features, tensor<fp32, [1]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [1, 1]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
-            tensor<fp32, [1]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 1, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 15, 23]> features, tensor<fp32, [1]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [1, 1]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
+            tensor<fp32, [1]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [1]> stacked_axes_0 = const()[name = tensor<string, []>("stacked_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1, 1, 15, 23]> stacked = expand_dims(axes = stacked_axes_0, x = features)[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, [3]>([1, 1, 345])];
+            tensor<fp32, [1, 1, 345]> input_1 = reshape(shape = var_26, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_42 = const()[name = tensor<string, []>("op_42"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 1, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 1, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 1, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 1, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_36, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 1, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 1, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_173 = const()[name = tensor<string, []>("op_173"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_174 = mul(x = input_13, y = var_173)[name = tensor<string, []>("op_174")];
+            tensor<fp32, [1, 1, 256]> input_15 = add(x = var_174, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -238,139 +244,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 1, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 1, 256]> var_188 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_190 = reshape(shape = var_189, x = var_188)[name = tensor<string, []>("op_190")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 1, 256]> var_194 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_196 = mul(x = var_194, y = var_195)[name = tensor<string, []>("op_196")];
+            tensor<int32, [4]> var_197 = const()[name = tensor<string, []>("op_197"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_198 = reshape(shape = var_197, x = var_196)[name = tensor<string, []>("op_198")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 1, 256]> var_202 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_204 = reshape(shape = var_203, x = var_202)[name = tensor<string, []>("op_204")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 1, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [1]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_198)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_190)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 1, 1]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [1, 1]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 1, 1]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_215 = reshape(shape = var_214, x = sqrt_s_t_1)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 1]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_215)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 1, 1]> var_217 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_217")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [1]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_204)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_217, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_219_transpose_x_0 = const()[name = tensor<string, []>("op_219_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_219_transpose_y_0 = const()[name = tensor<string, []>("op_219_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_219 = matmul(transpose_x = var_219_transpose_x_0, transpose_y = var_219_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_219")];
+            tensor<fp32, [1]> var_220 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_220")];
+            tensor<int32, [4]> var_221 = const()[name = tensor<string, []>("op_221"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_222 = reshape(shape = var_221, x = var_220)[name = tensor<string, []>("op_222")];
+            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_219, y = var_222)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 1, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_225 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_225")];
+            tensor<bool, []> var_227_transpose_x_1 = const()[name = tensor<string, []>("op_227_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_227_transpose_y_1 = const()[name = tensor<string, []>("op_227_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_227 = matmul(transpose_x = var_227_transpose_x_1, transpose_y = var_227_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_227")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_225, y = var_227)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_229)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_231 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_231")];
+            tensor<fp32, [1, 4, 64, 64]> var_232 = real_div(x = new_kv_unnorm_1, y = var_231)[name = tensor<string, []>("op_232")];
+            tensor<int32, [4]> var_233_perm_0 = const()[name = tensor<string, []>("op_233_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 1, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 1, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 1, 4, 64]> var_233 = transpose(perm = var_233_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_44, x = var_233)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_237, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 1, 256]> var_239 = silu(x = input_19)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [1, 1, 256]> input_21 = mul(x = var_239, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_250_begin_0 = const()[name = tensor<string, []>("op_250_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_250_end_0 = const()[name = tensor<string, []>("op_250_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_250_end_mask_0 = const()[name = tensor<string, []>("op_250_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_250 = slice_by_index(begin = var_250_begin_0, end = var_250_end_0, end_mask = var_250_end_mask_0, x = window_1)[name = tensor<string, []>("op_250")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, x_3))[name = tensor<string, []>("window_3")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = window_3)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_52, interleave = window_3_interleave_0, values = (var_250, x_3))[name = tensor<string, []>("window_3")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_23 = concat(axis = var_39, interleave = input_23_interleave_0, values = window_3)[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_249_split_sizes_0 = const()[name = tensor<string, []>("op_249_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_249_axis_0 = const()[name = tensor<string, []>("op_249_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_249_0, tensor<fp32, [1, 256, 16]> var_249_1 = split(axis = var_249_axis_0, split_sizes = var_249_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_249")];
-            tensor<fp32, [1, 256, 16]> var_251 = sigmoid(x = var_249_1)[name = tensor<string, []>("op_251")];
-            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_249_0, y = var_251)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [1, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_275_split_sizes_0 = const()[name = tensor<string, []>("op_275_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_275_axis_0 = const()[name = tensor<string, []>("op_275_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_275_0, tensor<fp32, [1, 256, 16]> var_275_1 = split(axis = var_275_axis_0, split_sizes = var_275_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_275")];
+            tensor<fp32, [1, 256, 16]> var_277 = sigmoid(x = var_275_1)[name = tensor<string, []>("op_277")];
+            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_275_0, y = var_277)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [1, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [1, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_282_begin_0 = const()[name = tensor<string, []>("op_282_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_282_end_0 = const()[name = tensor<string, []>("op_282_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_282_end_mask_0 = const()[name = tensor<string, []>("op_282_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [1, 1, 256]> var_282 = slice_by_index(begin = var_282_begin_0, end = var_282_end_0, end_mask = var_282_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_282")];
-            tensor<int32, [3]> var_284_perm_0 = const()[name = tensor<string, []>("op_284_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_284 = transpose(perm = var_284_perm_0, x = var_282)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 1, 256]> input_31 = add(x = x_3, y = var_284)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 1, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 1, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_307 = const()[name = tensor<string, []>("op_307"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_308 = mul(x = input_39, y = var_307)[name = tensor<string, []>("op_308")];
-            tensor<fp32, [1, 1, 256]> input_41 = add(x = var_308, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_308_begin_0 = const()[name = tensor<string, []>("op_308_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_308_end_0 = const()[name = tensor<string, []>("op_308_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_308_end_mask_0 = const()[name = tensor<string, []>("op_308_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [1, 1, 256]> var_308 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_308")];
+            tensor<int32, [3]> var_310_perm_0 = const()[name = tensor<string, []>("op_310_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_310 = transpose(perm = var_310_perm_0, x = var_308)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 1, 256]> input_33 = add(x = x_3, y = var_310)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 1, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 1, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_333 = const()[name = tensor<string, []>("op_333"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_334 = mul(x = input_41, y = var_333)[name = tensor<string, []>("op_334")];
+            tensor<fp32, [1, 1, 256]> input_43 = add(x = var_334, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 1, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 1, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_337 = const()[name = tensor<string, []>("op_337"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_338 = mul(x = input_51, y = var_337)[name = tensor<string, []>("op_338")];
-            tensor<fp32, [1, 1, 256]> input_53 = add(x = var_338, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 1, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 1, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_363 = const()[name = tensor<string, []>("op_363"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_364 = mul(x = input_53, y = var_363)[name = tensor<string, []>("op_364")];
+            tensor<fp32, [1, 1, 256]> input_55 = add(x = var_364, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -381,139 +387,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 1, 256]> var_352 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_354 = reshape(shape = var_353, x = var_352)[name = tensor<string, []>("op_354")];
+            tensor<fp32, [1, 1, 256]> var_378 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_379 = const()[name = tensor<string, []>("op_379"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_380 = reshape(shape = var_379, x = var_378)[name = tensor<string, []>("op_380")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_358 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_359 = const()[name = tensor<string, []>("op_359"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_360 = mul(x = var_358, y = var_359)[name = tensor<string, []>("op_360")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 1, 256]> var_384 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_386 = mul(x = var_384, y = var_385)[name = tensor<string, []>("op_386")];
+            tensor<int32, [4]> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_388 = reshape(shape = var_387, x = var_386)[name = tensor<string, []>("op_388")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_366 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_368 = reshape(shape = var_367, x = var_366)[name = tensor<string, []>("op_368")];
+            tensor<fp32, [1, 1, 256]> var_392 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 1, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [1]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_354)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_388)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_380)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 1, 1]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_378 = const()[name = tensor<string, []>("op_378"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_379 = reshape(shape = var_378, x = sqrt_s_t_3)[name = tensor<string, []>("op_379")];
-            tensor<fp32, [1, 1]> M_3 = real_div(x = encoder__causal_mask, y = var_379)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 1, 1]> var_381 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_381")];
+            tensor<int32, [2]> var_404 = const()[name = tensor<string, []>("op_404"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_405 = reshape(shape = var_404, x = sqrt_s_t_3)[name = tensor<string, []>("op_405")];
+            tensor<fp32, [1, 1]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_405)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 1, 1]> var_407 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_407")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_368)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_381, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_383_transpose_x_0 = const()[name = tensor<string, []>("op_383_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_383_transpose_y_0 = const()[name = tensor<string, []>("op_383_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_383 = matmul(transpose_x = var_383_transpose_x_0, transpose_y = var_383_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_383")];
-            tensor<fp32, [1]> var_384 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
-            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_383, y = var_386)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_394)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_407, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_409_transpose_x_0 = const()[name = tensor<string, []>("op_409_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_409_transpose_y_0 = const()[name = tensor<string, []>("op_409_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_409 = matmul(transpose_x = var_409_transpose_x_0, transpose_y = var_409_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_409")];
+            tensor<fp32, [1]> var_410 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_410")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
+            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_409, y = var_412)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 1, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_389 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_389")];
-            tensor<bool, []> var_391_transpose_x_1 = const()[name = tensor<string, []>("op_391_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_391_transpose_y_1 = const()[name = tensor<string, []>("op_391_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_1, transpose_y = var_391_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_389, y = var_391)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_393)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_395 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [1, 4, 64, 64]> var_396 = real_div(x = new_kv_unnorm_3, y = var_395)[name = tensor<string, []>("op_396")];
-            tensor<int32, [4]> var_397_perm_0 = const()[name = tensor<string, []>("op_397_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_415 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_415")];
+            tensor<bool, []> var_417_transpose_x_1 = const()[name = tensor<string, []>("op_417_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_417_transpose_y_1 = const()[name = tensor<string, []>("op_417_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_417 = matmul(transpose_x = var_417_transpose_x_1, transpose_y = var_417_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_417")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_415, y = var_417)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_419 = const()[name = tensor<string, []>("op_419"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_419)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_421 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_421")];
+            tensor<fp32, [1, 4, 64, 64]> var_422 = real_div(x = new_kv_unnorm_3, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423_perm_0 = const()[name = tensor<string, []>("op_423_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_397 = transpose(perm = var_397_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_397)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_401, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 1, 256]> var_403 = silu(x = input_57)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 1, 256]> input_59 = mul(x = var_403, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 1, 4, 64]> var_423 = transpose(perm = var_423_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_44, x = var_423)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_427 = const()[name = tensor<string, []>("op_427"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_427, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 1, 256]> var_429 = silu(x = input_59)[name = tensor<string, []>("op_429")];
+            tensor<fp32, [1, 1, 256]> input_61 = mul(x = var_429, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_5_begin_0 = const()[name = tensor<string, []>("window_5_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_5_end_0 = const()[name = tensor<string, []>("window_5_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_5_end_mask_0 = const()[name = tensor<string, []>("window_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_5_squeeze_mask_0 = const()[name = tensor<string, []>("window_5_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_5 = slice_by_index(begin = window_5_begin_0, end = window_5_end_0, end_mask = window_5_end_mask_0, squeeze_mask = window_5_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_414_begin_0 = const()[name = tensor<string, []>("op_414_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_414_end_0 = const()[name = tensor<string, []>("op_414_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_414_end_mask_0 = const()[name = tensor<string, []>("op_414_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_414 = slice_by_index(begin = var_414_begin_0, end = var_414_end_0, end_mask = var_414_end_mask_0, x = window_5)[name = tensor<string, []>("op_414")];
+            tensor<int32, [3]> var_440_begin_0 = const()[name = tensor<string, []>("op_440_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_440_end_0 = const()[name = tensor<string, []>("op_440_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_440_end_mask_0 = const()[name = tensor<string, []>("op_440_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_440 = slice_by_index(begin = var_440_begin_0, end = var_440_end_0, end_mask = var_440_end_mask_0, x = window_5)[name = tensor<string, []>("op_440")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_414, x_9))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = window_7)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_52, interleave = window_7_interleave_0, values = (var_440, x_9))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_63 = concat(axis = var_39, interleave = input_63_interleave_0, values = window_7)[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_439_split_sizes_0 = const()[name = tensor<string, []>("op_439_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_439_axis_0 = const()[name = tensor<string, []>("op_439_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_439_0, tensor<fp32, [1, 256, 16]> var_439_1 = split(axis = var_439_axis_0, split_sizes = var_439_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_439")];
-            tensor<fp32, [1, 256, 16]> var_441 = sigmoid(x = var_439_1)[name = tensor<string, []>("op_441")];
-            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_439_0, y = var_441)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [1, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_465_split_sizes_0 = const()[name = tensor<string, []>("op_465_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_465_axis_0 = const()[name = tensor<string, []>("op_465_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_465_0, tensor<fp32, [1, 256, 16]> var_465_1 = split(axis = var_465_axis_0, split_sizes = var_465_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 256, 16]> var_467 = sigmoid(x = var_465_1)[name = tensor<string, []>("op_467")];
+            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_465_0, y = var_467)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [1, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [1, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_472_begin_0 = const()[name = tensor<string, []>("op_472_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_472_end_0 = const()[name = tensor<string, []>("op_472_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_472_end_mask_0 = const()[name = tensor<string, []>("op_472_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [1, 1, 256]> var_472 = slice_by_index(begin = var_472_begin_0, end = var_472_end_0, end_mask = var_472_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_472")];
-            tensor<int32, [3]> var_474_perm_0 = const()[name = tensor<string, []>("op_474_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_474 = transpose(perm = var_474_perm_0, x = var_472)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 1, 256]> input_71 = add(x = x_9, y = var_474)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 1, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 1, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_497 = const()[name = tensor<string, []>("op_497"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_498 = mul(x = input_79, y = var_497)[name = tensor<string, []>("op_498")];
-            tensor<fp32, [1, 1, 256]> input_81 = add(x = var_498, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_498_begin_0 = const()[name = tensor<string, []>("op_498_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_498_end_0 = const()[name = tensor<string, []>("op_498_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_498_end_mask_0 = const()[name = tensor<string, []>("op_498_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [1, 1, 256]> var_498 = slice_by_index(begin = var_498_begin_0, end = var_498_end_0, end_mask = var_498_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_498")];
+            tensor<int32, [3]> var_500_perm_0 = const()[name = tensor<string, []>("op_500_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_500 = transpose(perm = var_500_perm_0, x = var_498)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 1, 256]> input_73 = add(x = x_9, y = var_500)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 1, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 1, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_523 = const()[name = tensor<string, []>("op_523"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_524 = mul(x = input_81, y = var_523)[name = tensor<string, []>("op_524")];
+            tensor<fp32, [1, 1, 256]> input_83 = add(x = var_524, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 1, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 1, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_527 = const()[name = tensor<string, []>("op_527"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_528 = mul(x = input_91, y = var_527)[name = tensor<string, []>("op_528")];
-            tensor<fp32, [1, 1, 256]> input_93 = add(x = var_528, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 1, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 1, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_553 = const()[name = tensor<string, []>("op_553"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_554 = mul(x = input_93, y = var_553)[name = tensor<string, []>("op_554")];
+            tensor<fp32, [1, 1, 256]> input_95 = add(x = var_554, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -524,139 +530,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 1, 256]> var_542 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_544 = reshape(shape = var_543, x = var_542)[name = tensor<string, []>("op_544")];
+            tensor<fp32, [1, 1, 256]> var_568 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_569 = const()[name = tensor<string, []>("op_569"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_570 = reshape(shape = var_569, x = var_568)[name = tensor<string, []>("op_570")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_548 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_550 = mul(x = var_548, y = var_549)[name = tensor<string, []>("op_550")];
-            tensor<int32, [4]> var_551 = const()[name = tensor<string, []>("op_551"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_552 = reshape(shape = var_551, x = var_550)[name = tensor<string, []>("op_552")];
+            tensor<fp32, [1, 1, 256]> var_574 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_576 = mul(x = var_574, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<int32, [4]> var_577 = const()[name = tensor<string, []>("op_577"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_578 = reshape(shape = var_577, x = var_576)[name = tensor<string, []>("op_578")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_556 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_557 = const()[name = tensor<string, []>("op_557"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_558 = reshape(shape = var_557, x = var_556)[name = tensor<string, []>("op_558")];
+            tensor<fp32, [1, 1, 256]> var_582 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 1, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [1]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_552)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_544)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_578)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_570)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 1, 1]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_568 = const()[name = tensor<string, []>("op_568"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_569 = reshape(shape = var_568, x = sqrt_s_t_5)[name = tensor<string, []>("op_569")];
-            tensor<fp32, [1, 1]> M_5 = real_div(x = encoder__causal_mask, y = var_569)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 1, 1]> var_571 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_571")];
+            tensor<int32, [2]> var_594 = const()[name = tensor<string, []>("op_594"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_595 = reshape(shape = var_594, x = sqrt_s_t_5)[name = tensor<string, []>("op_595")];
+            tensor<fp32, [1, 1]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_595)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 1, 1]> var_597 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_597")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_558)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_571, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_573_transpose_x_0 = const()[name = tensor<string, []>("op_573_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_573_transpose_y_0 = const()[name = tensor<string, []>("op_573_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_573 = matmul(transpose_x = var_573_transpose_x_0, transpose_y = var_573_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_573")];
-            tensor<fp32, [1]> var_574 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_574")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_573, y = var_576)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_597, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_599_transpose_x_0 = const()[name = tensor<string, []>("op_599_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_599_transpose_y_0 = const()[name = tensor<string, []>("op_599_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_599 = matmul(transpose_x = var_599_transpose_x_0, transpose_y = var_599_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_599")];
+            tensor<fp32, [1]> var_600 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_600")];
+            tensor<int32, [4]> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_602 = reshape(shape = var_601, x = var_600)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_599, y = var_602)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 1, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_579 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_579")];
-            tensor<bool, []> var_581_transpose_x_1 = const()[name = tensor<string, []>("op_581_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_581_transpose_y_1 = const()[name = tensor<string, []>("op_581_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_581 = matmul(transpose_x = var_581_transpose_x_1, transpose_y = var_581_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_581")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_579, y = var_581)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_583)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_585 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [1, 4, 64, 64]> var_586 = real_div(x = new_kv_unnorm_5, y = var_585)[name = tensor<string, []>("op_586")];
-            tensor<int32, [4]> var_587_perm_0 = const()[name = tensor<string, []>("op_587_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_605 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_605")];
+            tensor<bool, []> var_607_transpose_x_1 = const()[name = tensor<string, []>("op_607_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_607_transpose_y_1 = const()[name = tensor<string, []>("op_607_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_607 = matmul(transpose_x = var_607_transpose_x_1, transpose_y = var_607_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_607")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_605, y = var_607)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_609)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_611 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_611")];
+            tensor<fp32, [1, 4, 64, 64]> var_612 = real_div(x = new_kv_unnorm_5, y = var_611)[name = tensor<string, []>("op_612")];
+            tensor<int32, [4]> var_613_perm_0 = const()[name = tensor<string, []>("op_613_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_587 = transpose(perm = var_587_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_587)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_591, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 1, 256]> var_593 = silu(x = input_97)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 1, 256]> input_99 = mul(x = var_593, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 1, 4, 64]> var_613 = transpose(perm = var_613_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_44, x = var_613)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_617 = const()[name = tensor<string, []>("op_617"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_617, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 1, 256]> var_619 = silu(x = input_99)[name = tensor<string, []>("op_619")];
+            tensor<fp32, [1, 1, 256]> input_101 = mul(x = var_619, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_604_begin_0 = const()[name = tensor<string, []>("op_604_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_604_end_0 = const()[name = tensor<string, []>("op_604_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_604_end_mask_0 = const()[name = tensor<string, []>("op_604_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_604 = slice_by_index(begin = var_604_begin_0, end = var_604_end_0, end_mask = var_604_end_mask_0, x = window_9)[name = tensor<string, []>("op_604")];
+            tensor<int32, [3]> var_630_begin_0 = const()[name = tensor<string, []>("op_630_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_630_end_0 = const()[name = tensor<string, []>("op_630_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_630_end_mask_0 = const()[name = tensor<string, []>("op_630_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_630 = slice_by_index(begin = var_630_begin_0, end = var_630_end_0, end_mask = var_630_end_mask_0, x = window_9)[name = tensor<string, []>("op_630")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_604, x_15))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = window_11)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_52, interleave = window_11_interleave_0, values = (var_630, x_15))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_103 = concat(axis = var_39, interleave = input_103_interleave_0, values = window_11)[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_629_split_sizes_0 = const()[name = tensor<string, []>("op_629_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_629_axis_0 = const()[name = tensor<string, []>("op_629_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_629_0, tensor<fp32, [1, 256, 16]> var_629_1 = split(axis = var_629_axis_0, split_sizes = var_629_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 256, 16]> var_631 = sigmoid(x = var_629_1)[name = tensor<string, []>("op_631")];
-            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_629_0, y = var_631)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [1, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_655_split_sizes_0 = const()[name = tensor<string, []>("op_655_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_655_axis_0 = const()[name = tensor<string, []>("op_655_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_655_0, tensor<fp32, [1, 256, 16]> var_655_1 = split(axis = var_655_axis_0, split_sizes = var_655_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_655")];
+            tensor<fp32, [1, 256, 16]> var_657 = sigmoid(x = var_655_1)[name = tensor<string, []>("op_657")];
+            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_655_0, y = var_657)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [1, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [1, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_662_begin_0 = const()[name = tensor<string, []>("op_662_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_662_end_0 = const()[name = tensor<string, []>("op_662_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_662_end_mask_0 = const()[name = tensor<string, []>("op_662_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [1, 1, 256]> var_662 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_662")];
-            tensor<int32, [3]> var_664_perm_0 = const()[name = tensor<string, []>("op_664_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_664 = transpose(perm = var_664_perm_0, x = var_662)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 1, 256]> input_111 = add(x = x_15, y = var_664)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 1, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 1, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_688 = mul(x = input_119, y = var_687)[name = tensor<string, []>("op_688")];
-            tensor<fp32, [1, 1, 256]> input_121 = add(x = var_688, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_688_begin_0 = const()[name = tensor<string, []>("op_688_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_688_end_0 = const()[name = tensor<string, []>("op_688_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_688_end_mask_0 = const()[name = tensor<string, []>("op_688_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [1, 1, 256]> var_688 = slice_by_index(begin = var_688_begin_0, end = var_688_end_0, end_mask = var_688_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_688")];
+            tensor<int32, [3]> var_690_perm_0 = const()[name = tensor<string, []>("op_690_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_690 = transpose(perm = var_690_perm_0, x = var_688)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 1, 256]> input_113 = add(x = x_15, y = var_690)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 1, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 1, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_713 = const()[name = tensor<string, []>("op_713"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_714 = mul(x = input_121, y = var_713)[name = tensor<string, []>("op_714")];
+            tensor<fp32, [1, 1, 256]> input_123 = add(x = var_714, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 1, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 1, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_717 = const()[name = tensor<string, []>("op_717"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_718 = mul(x = input_131, y = var_717)[name = tensor<string, []>("op_718")];
-            tensor<fp32, [1, 1, 256]> input_133 = add(x = var_718, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 1, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 1, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_743 = const()[name = tensor<string, []>("op_743"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_744 = mul(x = input_133, y = var_743)[name = tensor<string, []>("op_744")];
+            tensor<fp32, [1, 1, 256]> input_135 = add(x = var_744, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -667,175 +673,168 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 1, 256]> var_732 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_733 = const()[name = tensor<string, []>("op_733"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_734 = reshape(shape = var_733, x = var_732)[name = tensor<string, []>("op_734")];
+            tensor<fp32, [1, 1, 256]> var_758 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_760 = reshape(shape = var_759, x = var_758)[name = tensor<string, []>("op_760")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_738 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_739 = const()[name = tensor<string, []>("op_739"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_740 = mul(x = var_738, y = var_739)[name = tensor<string, []>("op_740")];
-            tensor<int32, [4]> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_742 = reshape(shape = var_741, x = var_740)[name = tensor<string, []>("op_742")];
+            tensor<fp32, [1, 1, 256]> var_764 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_766 = mul(x = var_764, y = var_765)[name = tensor<string, []>("op_766")];
+            tensor<int32, [4]> var_767 = const()[name = tensor<string, []>("op_767"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_768 = reshape(shape = var_767, x = var_766)[name = tensor<string, []>("op_768")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_746 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_748 = reshape(shape = var_747, x = var_746)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 1, 256]> var_772 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_774 = reshape(shape = var_773, x = var_772)[name = tensor<string, []>("op_774")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 1, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [1]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_742)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_734)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_768)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_760)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 1, 1]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_758 = const()[name = tensor<string, []>("op_758"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_759 = reshape(shape = var_758, x = sqrt_s_t_7)[name = tensor<string, []>("op_759")];
-            tensor<fp32, [1, 1]> M_7 = real_div(x = encoder__causal_mask, y = var_759)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 1, 1]> var_761 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_761")];
+            tensor<int32, [2]> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_785 = reshape(shape = var_784, x = sqrt_s_t_7)[name = tensor<string, []>("op_785")];
+            tensor<fp32, [1, 1]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_785)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 1, 1]> var_787 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_787")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_748)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_761, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_763_transpose_x_0 = const()[name = tensor<string, []>("op_763_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_763_transpose_y_0 = const()[name = tensor<string, []>("op_763_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_763 = matmul(transpose_x = var_763_transpose_x_0, transpose_y = var_763_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_763")];
-            tensor<fp32, [1]> var_764 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_763, y = var_766)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_774)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_787, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_789_transpose_x_0 = const()[name = tensor<string, []>("op_789_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_789_transpose_y_0 = const()[name = tensor<string, []>("op_789_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_789 = matmul(transpose_x = var_789_transpose_x_0, transpose_y = var_789_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_789")];
+            tensor<fp32, [1]> var_790 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_790")];
+            tensor<int32, [4]> var_791 = const()[name = tensor<string, []>("op_791"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_792 = reshape(shape = var_791, x = var_790)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_789, y = var_792)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 1, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_769 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_769")];
-            tensor<bool, []> var_771_transpose_x_1 = const()[name = tensor<string, []>("op_771_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_771_transpose_y_1 = const()[name = tensor<string, []>("op_771_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_771 = matmul(transpose_x = var_771_transpose_x_1, transpose_y = var_771_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_771")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_769, y = var_771)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_773)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_775 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_775")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_775)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_777_perm_0 = const()[name = tensor<string, []>("op_777_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_795 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_795")];
+            tensor<bool, []> var_797_transpose_x_1 = const()[name = tensor<string, []>("op_797_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_797_transpose_y_1 = const()[name = tensor<string, []>("op_797_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_797 = matmul(transpose_x = var_797_transpose_x_1, transpose_y = var_797_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_797")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_795, y = var_797)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_799)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_801 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_801")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_801)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_803_perm_0 = const()[name = tensor<string, []>("op_803_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_777 = transpose(perm = var_777_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_777)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_781, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 1, 256]> var_783 = silu(x = input_137)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [1, 1, 256]> input_139 = mul(x = var_783, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 1, 4, 64]> var_803 = transpose(perm = var_803_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_44, x = var_803)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_807, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 1, 256]> var_809 = silu(x = input_139)[name = tensor<string, []>("op_809")];
+            tensor<fp32, [1, 1, 256]> input_141 = mul(x = var_809, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_794_begin_0 = const()[name = tensor<string, []>("op_794_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_794_end_0 = const()[name = tensor<string, []>("op_794_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_794_end_mask_0 = const()[name = tensor<string, []>("op_794_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_794 = slice_by_index(begin = var_794_begin_0, end = var_794_end_0, end_mask = var_794_end_mask_0, x = window_13)[name = tensor<string, []>("op_794")];
+            tensor<int32, [3]> var_820_begin_0 = const()[name = tensor<string, []>("op_820_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_820_end_0 = const()[name = tensor<string, []>("op_820_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_820_end_mask_0 = const()[name = tensor<string, []>("op_820_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_820 = slice_by_index(begin = var_820_begin_0, end = var_820_end_0, end_mask = var_820_end_mask_0, x = window_13)[name = tensor<string, []>("op_820")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_794, x_21))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = window)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_52, interleave = window_interleave_0, values = (var_820, x_21))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_143 = concat(axis = var_39, interleave = input_143_interleave_0, values = window)[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_819_split_sizes_0 = const()[name = tensor<string, []>("op_819_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_819_axis_0 = const()[name = tensor<string, []>("op_819_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_819_0, tensor<fp32, [1, 256, 16]> var_819_1 = split(axis = var_819_axis_0, split_sizes = var_819_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 256, 16]> var_821 = sigmoid(x = var_819_1)[name = tensor<string, []>("op_821")];
-            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_819_0, y = var_821)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [1, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_845_split_sizes_0 = const()[name = tensor<string, []>("op_845_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_845_axis_0 = const()[name = tensor<string, []>("op_845_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_845_0, tensor<fp32, [1, 256, 16]> var_845_1 = split(axis = var_845_axis_0, split_sizes = var_845_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 256, 16]> var_847 = sigmoid(x = var_845_1)[name = tensor<string, []>("op_847")];
+            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_845_0, y = var_847)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [1, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [1, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_852_begin_0 = const()[name = tensor<string, []>("op_852_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_852_end_0 = const()[name = tensor<string, []>("op_852_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_852_end_mask_0 = const()[name = tensor<string, []>("op_852_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [1, 1, 256]> var_852 = slice_by_index(begin = var_852_begin_0, end = var_852_end_0, end_mask = var_852_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_852")];
-            tensor<int32, [3]> var_854_perm_0 = const()[name = tensor<string, []>("op_854_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_854 = transpose(perm = var_854_perm_0, x = var_852)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 1, 256]> input_151 = add(x = x_21, y = var_854)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 1, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 1, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_877 = const()[name = tensor<string, []>("op_877"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_878 = mul(x = input_159, y = var_877)[name = tensor<string, []>("op_878")];
-            tensor<fp32, [1, 1, 256]> input_161 = add(x = var_878, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_878_begin_0 = const()[name = tensor<string, []>("op_878_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_878_end_0 = const()[name = tensor<string, []>("op_878_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_878_end_mask_0 = const()[name = tensor<string, []>("op_878_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [1, 1, 256]> var_878 = slice_by_index(begin = var_878_begin_0, end = var_878_end_0, end_mask = var_878_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_878")];
+            tensor<int32, [3]> var_880_perm_0 = const()[name = tensor<string, []>("op_880_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_880 = transpose(perm = var_880_perm_0, x = var_878)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 1, 256]> input_153 = add(x = x_21, y = var_880)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 1, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 1, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_904 = mul(x = input_161, y = var_903)[name = tensor<string, []>("op_904")];
+            tensor<fp32, [1, 1, 256]> input_163 = add(x = var_904, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 1]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_41, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = cat)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_898 = const()[name = tensor<string, []>("op_898"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 1, 1]> var_899 = reduce_l2_norm(axes = var_898, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = cat)[name = tensor<string, []>("op_922")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 1, 1]> var_925 = reduce_l2_norm(axes = var_924, keep_dims = var_35, x = input_165)[name = tensor<string, []>("op_925")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_899)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_903_axis_0 = const()[name = tensor<string, []>("op_903_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_903_axis_0, values = (var_206, var_396, var_586, nkv_1))[name = tensor<string, []>("op_903")];
-            tensor<int32, []> var_905_axis_0 = const()[name = tensor<string, []>("op_905_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_905_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_905")];
-            tensor<int32, []> var_907_axis_0 = const()[name = tensor<string, []>("op_907_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_907_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_907")];
-            tensor<fp32, []> var_916 = const()[name = tensor<string, []>("op_916"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_923 = const()[name = tensor<string, []>("op_923"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_926 = const()[name = tensor<string, []>("op_926"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<int32, []>(0)];
-            tensor<fp32, [1, 1, 9, 256]> var_993 = const()[name = tensor<string, []>("op_993"), val = tensor<fp32, [1, 1, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_998_axes_0 = const()[name = tensor<string, []>("op_998_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 1, 1, 256]> var_998 = expand_dims(axes = var_998_axes_0, x = emb)[name = tensor<string, []>("op_998")];
+            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_49, beta = const_12, x = var_925)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_929_axis_0, values = (var_232, var_422, var_612, nkv_1))[name = tensor<string, []>("op_929")];
+            tensor<int32, []> var_931_axis_0 = const()[name = tensor<string, []>("op_931_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_931_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_931")];
+            tensor<int32, []> var_933_axis_0 = const()[name = tensor<string, []>("op_933_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_933_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_933")];
+            tensor<fp32, [1, 1, 9, 256]> var_996 = const()[name = tensor<string, []>("op_996"), val = tensor<fp32, [1, 1, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
+            tensor<int32, [1]> var_1001_axes_0 = const()[name = tensor<string, []>("op_1001_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 1, 1, 256]> var_1001 = expand_dims(axes = var_1001_axes_0, x = emb)[name = tensor<string, []>("op_1001")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 9, 1])];
-            tensor<fp32, [1, 1, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_998)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 1, 9, 512]> input_165 = concat(axis = var_930, interleave = input_165_interleave_0, values = (emb_exp, var_993))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 1, 9, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1010 = const()[name = tensor<string, []>("op_1010"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [1, 9, 1, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [9, 1, 256]> x_29 = reshape(shape = var_1010, x = var_1006)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 1, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1001)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 1, 9, 512]> input_167 = concat(axis = var_42, interleave = input_167_interleave_0, values = (emb_exp, var_996))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 1, 9, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1009_perm_0 = const()[name = tensor<string, []>("op_1009_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1013 = const()[name = tensor<string, []>("op_1013"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [1, 9, 1, 256]> var_1009 = transpose(perm = var_1009_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [9, 1, 256]> x_29 = reshape(shape = var_1013, x = var_1009)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -846,131 +845,131 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [9, 1, 256]> var_1018 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1020 = reshape(shape = var_1019, x = var_1018)[name = tensor<string, []>("op_1020")];
+            tensor<fp32, [9, 1, 256]> var_1021 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1023 = reshape(shape = var_1022, x = var_1021)[name = tensor<string, []>("op_1023")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> var_1024 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1025 = const()[name = tensor<string, []>("op_1025"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 1, 256]> var_1026 = mul(x = var_1024, y = var_1025)[name = tensor<string, []>("op_1026")];
-            tensor<int32, [4]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1028 = reshape(shape = var_1027, x = var_1026)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [9, 1, 256]> var_1027 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1028 = const()[name = tensor<string, []>("op_1028"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 1, 256]> var_1029 = mul(x = var_1027, y = var_1028)[name = tensor<string, []>("op_1029")];
+            tensor<int32, [4]> var_1030 = const()[name = tensor<string, []>("op_1030"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1031 = reshape(shape = var_1030, x = var_1029)[name = tensor<string, []>("op_1031")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> var_1032 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1033 = const()[name = tensor<string, []>("op_1033"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1034 = reshape(shape = var_1033, x = var_1032)[name = tensor<string, []>("op_1034")];
+            tensor<fp32, [9, 1, 256]> var_1035 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1036 = const()[name = tensor<string, []>("op_1036"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1037 = reshape(shape = var_1036, x = var_1035)[name = tensor<string, []>("op_1037")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [9, 1, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_936, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_39, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [1]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_1 = clip(alpha = var_926, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [1]> clip_1 = clip(alpha = var_29, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [1]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1028)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [9, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1020)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [9, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1031)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [9, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1023)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [9, 4, 1, 1]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1046 = const()[name = tensor<string, []>("op_1046"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1047 = reshape(shape = var_1046, x = valid_mask)[name = tensor<string, []>("op_1047")];
             tensor<int32, [2]> var_1049 = const()[name = tensor<string, []>("op_1049"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = sqrt_s_t_9)[name = tensor<string, []>("op_1050")];
-            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1047, y = var_1050)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [9, 4, 1, 1]> var_1052 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = valid_mask)[name = tensor<string, []>("op_1050")];
+            tensor<int32, [2]> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1053 = reshape(shape = var_1052, x = sqrt_s_t_9)[name = tensor<string, []>("op_1053")];
+            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1050, y = var_1053)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [9, 4, 1, 1]> var_1055 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1055")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1034)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [9, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1052, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1054_transpose_x_0 = const()[name = tensor<string, []>("op_1054_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1054_transpose_y_0 = const()[name = tensor<string, []>("op_1054_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 1, 64]> var_1054 = matmul(transpose_x = var_1054_transpose_x_0, transpose_y = var_1054_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1054")];
-            tensor<fp32, [1]> var_1055 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1055")];
-            tensor<int32, [4]> var_1056 = const()[name = tensor<string, []>("op_1056"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1057 = reshape(shape = var_1056, x = var_1055)[name = tensor<string, []>("op_1057")];
-            tensor<fp32, [9, 4, 1, 64]> cross_9 = mul(x = var_1054, y = var_1057)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [9, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1037)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [9, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1055, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1057_transpose_x_0 = const()[name = tensor<string, []>("op_1057_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1057_transpose_y_0 = const()[name = tensor<string, []>("op_1057_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 1, 64]> var_1057 = matmul(transpose_x = var_1057_transpose_x_0, transpose_y = var_1057_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1057")];
+            tensor<fp32, [1]> var_1058 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1058")];
+            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [9, 4, 1, 64]> cross_9 = mul(x = var_1057, y = var_1060)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [9, 4, 1, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1060 = const()[name = tensor<string, []>("op_1060"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1061 = reshape(shape = var_1060, x = valid_mask)[name = tensor<string, []>("op_1061")];
-            tensor<fp32, [9, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1061)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1063 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1063")];
-            tensor<bool, []> var_1065_transpose_x_1 = const()[name = tensor<string, []>("op_1065_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1065_transpose_y_1 = const()[name = tensor<string, []>("op_1065_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1065 = matmul(transpose_x = var_1065_transpose_x_1, transpose_y = var_1065_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1065")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1063, y = var_1065)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1067_keep_dims_0 = const()[name = tensor<string, []>("op_1067_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1067 = reduce_sum(keep_dims = var_1067_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1067")];
-            tensor<int32, [1]> var_1068 = const()[name = tensor<string, []>("op_1068"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1069 = reshape(shape = var_1068, x = var_1067)[name = tensor<string, []>("op_1069")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1069)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1063 = const()[name = tensor<string, []>("op_1063"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1064 = reshape(shape = var_1063, x = valid_mask)[name = tensor<string, []>("op_1064")];
+            tensor<fp32, [9, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1064)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [9, 4, 64, 64]> var_1066 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1066")];
+            tensor<bool, []> var_1068_transpose_x_1 = const()[name = tensor<string, []>("op_1068_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1068_transpose_y_1 = const()[name = tensor<string, []>("op_1068_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1068 = matmul(transpose_x = var_1068_transpose_x_1, transpose_y = var_1068_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1068")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1066, y = var_1068)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1070_keep_dims_0 = const()[name = tensor<string, []>("op_1070_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1070 = reduce_sum(keep_dims = var_1070_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1070")];
+            tensor<int32, [1]> var_1071 = const()[name = tensor<string, []>("op_1071"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1072 = reshape(shape = var_1071, x = var_1070)[name = tensor<string, []>("op_1072")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1072)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_926, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_29, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1073 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1073")];
-            tensor<int32, [4]> var_1074_perm_0 = const()[name = tensor<string, []>("op_1074_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [9, 4, 64, 64]> var_1076 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1076")];
+            tensor<int32, [4]> var_1077_perm_0 = const()[name = tensor<string, []>("op_1077_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 1, 4, 64]> var_1074 = transpose(perm = var_1074_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [9, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_923, x = var_1074)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [9, 1, 256]> out_29 = reshape(shape = var_1078, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [9, 1, 256]> var_1080 = silu(x = input_169)[name = tensor<string, []>("op_1080")];
-            tensor<fp32, [9, 1, 256]> input_171 = mul(x = var_1080, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [9, 1, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [9, 1, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 1, 4, 64]> var_1077 = transpose(perm = var_1077_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [9, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_44, x = var_1077)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [9, 1, 256]> out_29 = reshape(shape = var_1081, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [9, 1, 256]> var_1083 = silu(x = input_171)[name = tensor<string, []>("op_1083")];
+            tensor<fp32, [9, 1, 256]> input_173 = mul(x = var_1083, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 1, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [9, 1, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_921, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1090 = const()[name = tensor<string, []>("op_1090"), val = tensor<int32, [4]>([1, 9, 1, 256])];
-            tensor<fp32, [1, 9, 1, 256]> var_1091 = reshape(shape = var_1090, x = xt_1)[name = tensor<string, []>("op_1091")];
-            tensor<int32, [4]> var_1092_perm_0 = const()[name = tensor<string, []>("op_1092_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1095 = const()[name = tensor<string, []>("op_1095"), val = tensor<int32, [3]>([1, 9, 256])];
-            tensor<fp32, [1, 1, 9, 256]> var_1092 = transpose(perm = var_1092_perm_0, x = var_1091)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [1, 9, 256]> query_1 = reshape(shape = var_1095, x = var_1092)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [9, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_36, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [4]>([1, 9, 1, 256])];
+            tensor<fp32, [1, 9, 1, 256]> var_1094 = reshape(shape = var_1093, x = xt_1)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [4]> var_1095_perm_0 = const()[name = tensor<string, []>("op_1095_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1098 = const()[name = tensor<string, []>("op_1098"), val = tensor<int32, [3]>([1, 9, 256])];
+            tensor<fp32, [1, 1, 9, 256]> var_1095 = transpose(perm = var_1095_perm_0, x = var_1094)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [1, 9, 256]> query_1 = reshape(shape = var_1098, x = var_1095)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 1, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [9, 1, 768]> var_1118 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [9, 1, 768]> var_1121 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([9, 1, 3, 256])];
-            tensor<fp32, [9, 1, 3, 256]> var_1120 = reshape(shape = concat_1, x = var_1118)[name = tensor<string, []>("op_1120")];
-            tensor<int32, [1]> var_1121_axes_0 = const()[name = tensor<string, []>("op_1121_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 1, 3, 256]> var_1121 = expand_dims(axes = var_1121_axes_0, x = var_1120)[name = tensor<string, []>("op_1121")];
-            tensor<int32, [5]> var_1122_perm_0 = const()[name = tensor<string, []>("op_1122_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1123_axes_0 = const()[name = tensor<string, []>("op_1123_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 1, 1, 256]> var_1122 = transpose(perm = var_1122_perm_0, x = var_1121)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 9, 1, 256]> var_1123 = squeeze(axes = var_1123_axes_0, x = var_1122)[name = tensor<string, []>("op_1123")];
+            tensor<fp32, [9, 1, 3, 256]> var_1123 = reshape(shape = concat_1, x = var_1121)[name = tensor<string, []>("op_1123")];
+            tensor<int32, [1]> var_1124_axes_0 = const()[name = tensor<string, []>("op_1124_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 1, 3, 256]> var_1124 = expand_dims(axes = var_1124_axes_0, x = var_1123)[name = tensor<string, []>("op_1124")];
+            tensor<int32, [5]> var_1125_perm_0 = const()[name = tensor<string, []>("op_1125_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1126_axes_0 = const()[name = tensor<string, []>("op_1126_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 1, 1, 256]> var_1125 = transpose(perm = var_1125_perm_0, x = var_1124)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 9, 1, 256]> var_1126 = squeeze(axes = var_1126_axes_0, x = var_1125)[name = tensor<string, []>("op_1126")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 9, 1, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [9, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 9, 1, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [9, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 9, 1, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1131 = const()[name = tensor<string, []>("op_1131"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1132 = reshape(shape = var_1131, x = q_11)[name = tensor<string, []>("op_1132")];
+            tensor<fp32, [9, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1134 = const()[name = tensor<string, []>("op_1134"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1135 = reshape(shape = var_1134, x = q_11)[name = tensor<string, []>("op_1135")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1138 = const()[name = tensor<string, []>("op_1138"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1139 = reshape(shape = var_1138, x = k_11)[name = tensor<string, []>("op_1139")];
+            tensor<int32, [3]> var_1141 = const()[name = tensor<string, []>("op_1141"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1142 = reshape(shape = var_1141, x = k_11)[name = tensor<string, []>("op_1142")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1146 = reshape(shape = var_1145, x = v_11)[name = tensor<string, []>("op_1146")];
+            tensor<int32, [3]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1149 = reshape(shape = var_1148, x = v_11)[name = tensor<string, []>("op_1149")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1149 = const()[name = tensor<string, []>("op_1149"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1132)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [1, 4, 9, 64]> q_15 = reshape(shape = var_1149, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1151 = const()[name = tensor<string, []>("op_1151"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1139)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [1, 4, 9, 64]> k_15 = reshape(shape = var_1151, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1153 = const()[name = tensor<string, []>("op_1153"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1146)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [1, 4, 9, 64]> v_15 = reshape(shape = var_1153, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1135)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [1, 4, 9, 64]> q_15 = reshape(shape = var_1152, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1142)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [1, 4, 9, 64]> k_15 = reshape(shape = var_1154, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1149)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [1, 4, 9, 64]> v_15 = reshape(shape = var_1156, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 9, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -981,30 +980,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 9, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1161 = const()[name = tensor<string, []>("op_1161"), val = tensor<int32, [2]>([9, 256])];
-            tensor<fp32, [9, 1, 4, 64]> var_1157 = transpose(perm = var_1156, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [9, 256]> attn_output_3 = reshape(shape = var_1161, x = var_1157)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [9, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [9, 1, 256]> attn_output_7 = reshape(shape = var_1165, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [2]>([9, 256])];
+            tensor<fp32, [9, 1, 4, 64]> var_1160 = transpose(perm = var_1159, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [9, 256]> attn_output_3 = reshape(shape = var_1164, x = var_1160)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [9, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1168 = const()[name = tensor<string, []>("op_1168"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [9, 1, 256]> attn_output_7 = reshape(shape = var_1168, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 9, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [1, 9, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 9, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_921, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [1, 9, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [1, 9, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [1, 9, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [1, 9, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 9, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 9, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_36, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [1, 9, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [1, 9, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 9, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [1, 9, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_921, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 9, 256])];
-            tensor<fp32, [1, 1, 9, 256]> x_31 = reshape(shape = var_1185, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1187_perm_0 = const()[name = tensor<string, []>("op_1187_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [1, 9, 1, 256]> var_1187 = transpose(perm = var_1187_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [9, 1, 256]> x = reshape(shape = var_1191, x = var_1187)[name = tensor<string, []>("x")];
+            tensor<fp32, [1, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_36, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([1, 1, 9, 256])];
+            tensor<fp32, [1, 1, 9, 256]> x_31 = reshape(shape = var_1188, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1190_perm_0 = const()[name = tensor<string, []>("op_1190_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [1, 9, 1, 256]> var_1190 = transpose(perm = var_1190_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [9, 1, 256]> x = reshape(shape = var_1194, x = var_1190)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1015,120 +1014,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [9, 1, 256]> var_1199 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1200 = const()[name = tensor<string, []>("op_1200"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1201 = reshape(shape = var_1200, x = var_1199)[name = tensor<string, []>("op_1201")];
+            tensor<fp32, [9, 1, 256]> var_1202 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1203 = const()[name = tensor<string, []>("op_1203"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1204 = reshape(shape = var_1203, x = var_1202)[name = tensor<string, []>("op_1204")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> var_1205 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 1, 256]> var_1207 = mul(x = var_1205, y = var_1206)[name = tensor<string, []>("op_1207")];
-            tensor<int32, [4]> var_1208 = const()[name = tensor<string, []>("op_1208"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1209 = reshape(shape = var_1208, x = var_1207)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [9, 1, 256]> var_1208 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 1, 256]> var_1210 = mul(x = var_1208, y = var_1209)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [4]> var_1211 = const()[name = tensor<string, []>("op_1211"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1212 = reshape(shape = var_1211, x = var_1210)[name = tensor<string, []>("op_1212")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> var_1213 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1214 = const()[name = tensor<string, []>("op_1214"), val = tensor<int32, [4]>([9, 1, 4, 64])];
-            tensor<fp32, [9, 1, 4, 64]> var_1215 = reshape(shape = var_1214, x = var_1213)[name = tensor<string, []>("op_1215")];
+            tensor<fp32, [9, 1, 256]> var_1216 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([9, 1, 4, 64])];
+            tensor<fp32, [9, 1, 4, 64]> var_1218 = reshape(shape = var_1217, x = var_1216)[name = tensor<string, []>("op_1218")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 1, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [9, 1, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [1]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_3 = clip(alpha = var_926, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [1]> clip_3 = clip(alpha = var_29, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [1]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1209)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [9, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1201)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [9, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1212)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [9, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1204)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [9, 4, 1, 1]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1231 = reshape(shape = var_1230, x = sqrt_s_t)[name = tensor<string, []>("op_1231")];
-            tensor<fp32, [1, 1]> M = real_div(x = var_1047, y = var_1231)[name = tensor<string, []>("M")];
-            tensor<fp32, [9, 4, 1, 1]> var_1233 = mul(x = qk, y = M)[name = tensor<string, []>("op_1233")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1215)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [9, 4, 1, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1233, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1235_transpose_x_0 = const()[name = tensor<string, []>("op_1235_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1235_transpose_y_0 = const()[name = tensor<string, []>("op_1235_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 1, 64]> var_1235 = matmul(transpose_x = var_1235_transpose_x_0, transpose_y = var_1235_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1235")];
-            tensor<fp32, [1]> var_1236 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1236")];
-            tensor<int32, [4]> var_1237 = const()[name = tensor<string, []>("op_1237"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1238 = reshape(shape = var_1237, x = var_1236)[name = tensor<string, []>("op_1238")];
-            tensor<fp32, [9, 4, 1, 64]> cross = mul(x = var_1235, y = var_1238)[name = tensor<string, []>("cross")];
-            tensor<fp32, [9, 4, 1, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [9, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1061)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [9, 4, 64, 64]> var_1244 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1244")];
-            tensor<bool, []> var_1246_transpose_x_1 = const()[name = tensor<string, []>("op_1246_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1246_transpose_y_1 = const()[name = tensor<string, []>("op_1246_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1246 = matmul(transpose_x = var_1246_transpose_x_1, transpose_y = var_1246_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1246")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1244, y = var_1246)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1069)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1233 = const()[name = tensor<string, []>("op_1233"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1234 = reshape(shape = var_1233, x = sqrt_s_t)[name = tensor<string, []>("op_1234")];
+            tensor<fp32, [1, 1]> M = real_div(x = var_1050, y = var_1234)[name = tensor<string, []>("M")];
+            tensor<fp32, [9, 4, 1, 1]> var_1236 = mul(x = qk, y = M)[name = tensor<string, []>("op_1236")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1218)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [9, 4, 1, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1236, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1238_transpose_x_0 = const()[name = tensor<string, []>("op_1238_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1238_transpose_y_0 = const()[name = tensor<string, []>("op_1238_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 1, 64]> var_1238 = matmul(transpose_x = var_1238_transpose_x_0, transpose_y = var_1238_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1238")];
+            tensor<fp32, [1]> var_1239 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [9, 4, 1, 64]> cross = mul(x = var_1238, y = var_1241)[name = tensor<string, []>("cross")];
+            tensor<fp32, [9, 4, 1, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [9, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1064)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [9, 4, 64, 64]> var_1247 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1247")];
+            tensor<bool, []> var_1249_transpose_x_1 = const()[name = tensor<string, []>("op_1249_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1249_transpose_y_1 = const()[name = tensor<string, []>("op_1249_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1249 = matmul(transpose_x = var_1249_transpose_x_1, transpose_y = var_1249_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1249")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1247, y = var_1249)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1072)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_926, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_29, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [9, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1255_perm_0 = const()[name = tensor<string, []>("op_1255_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1258_perm_0 = const()[name = tensor<string, []>("op_1258_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 1, 4, 64]> var_1255 = transpose(perm = var_1255_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [9, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_923, x = var_1255)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [9, 1, 256]> out = reshape(shape = var_1259, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [9, 1, 256]> var_1261 = silu(x = input_187)[name = tensor<string, []>("op_1261")];
-            tensor<fp32, [9, 1, 256]> input_189 = mul(x = var_1261, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [9, 1, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [9, 1, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 1, 4, 64]> var_1258 = transpose(perm = var_1258_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [9, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_44, x = var_1258)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [9, 1, 256]> out = reshape(shape = var_1262, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [9, 1, 256]> var_1264 = silu(x = input_189)[name = tensor<string, []>("op_1264")];
+            tensor<fp32, [9, 1, 256]> input_191 = mul(x = var_1264, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 1, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [9, 1, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_921, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1271 = const()[name = tensor<string, []>("op_1271"), val = tensor<int32, [4]>([1, 9, 1, 256])];
-            tensor<fp32, [1, 9, 1, 256]> var_1272 = reshape(shape = var_1271, x = xt_5)[name = tensor<string, []>("op_1272")];
-            tensor<int32, [4]> var_1273_perm_0 = const()[name = tensor<string, []>("op_1273_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1276 = const()[name = tensor<string, []>("op_1276"), val = tensor<int32, [3]>([1, 9, 256])];
-            tensor<fp32, [1, 1, 9, 256]> var_1273 = transpose(perm = var_1273_perm_0, x = var_1272)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [1, 9, 256]> query_5 = reshape(shape = var_1276, x = var_1273)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [9, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_36, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [4]>([1, 9, 1, 256])];
+            tensor<fp32, [1, 9, 1, 256]> var_1275 = reshape(shape = var_1274, x = xt_5)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [4]> var_1276_perm_0 = const()[name = tensor<string, []>("op_1276_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1279 = const()[name = tensor<string, []>("op_1279"), val = tensor<int32, [3]>([1, 9, 256])];
+            tensor<fp32, [1, 1, 9, 256]> var_1276 = transpose(perm = var_1276_perm_0, x = var_1275)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [1, 9, 256]> query_5 = reshape(shape = var_1279, x = var_1276)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 1, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [9, 1, 768]> var_1299 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [9, 1, 768]> var_1302 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([9, 1, 3, 256])];
-            tensor<fp32, [9, 1, 3, 256]> var_1301 = reshape(shape = concat_2, x = var_1299)[name = tensor<string, []>("op_1301")];
-            tensor<int32, [1]> var_1302_axes_0 = const()[name = tensor<string, []>("op_1302_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 1, 3, 256]> var_1302 = expand_dims(axes = var_1302_axes_0, x = var_1301)[name = tensor<string, []>("op_1302")];
-            tensor<int32, [5]> var_1303_perm_0 = const()[name = tensor<string, []>("op_1303_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1304_axes_0 = const()[name = tensor<string, []>("op_1304_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 1, 1, 256]> var_1303 = transpose(perm = var_1303_perm_0, x = var_1302)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 9, 1, 256]> var_1304 = squeeze(axes = var_1304_axes_0, x = var_1303)[name = tensor<string, []>("op_1304")];
+            tensor<fp32, [9, 1, 3, 256]> var_1304 = reshape(shape = concat_2, x = var_1302)[name = tensor<string, []>("op_1304")];
+            tensor<int32, [1]> var_1305_axes_0 = const()[name = tensor<string, []>("op_1305_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 1, 3, 256]> var_1305 = expand_dims(axes = var_1305_axes_0, x = var_1304)[name = tensor<string, []>("op_1305")];
+            tensor<int32, [5]> var_1306_perm_0 = const()[name = tensor<string, []>("op_1306_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1307_axes_0 = const()[name = tensor<string, []>("op_1307_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 1, 1, 256]> var_1306 = transpose(perm = var_1306_perm_0, x = var_1305)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 9, 1, 256]> var_1307 = squeeze(axes = var_1307_axes_0, x = var_1306)[name = tensor<string, []>("op_1307")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 9, 1, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [9, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 9, 1, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [9, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 9, 1, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1312 = const()[name = tensor<string, []>("op_1312"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1313 = reshape(shape = var_1312, x = q_19)[name = tensor<string, []>("op_1313")];
+            tensor<fp32, [9, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1315 = const()[name = tensor<string, []>("op_1315"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1316 = reshape(shape = var_1315, x = q_19)[name = tensor<string, []>("op_1316")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1319 = const()[name = tensor<string, []>("op_1319"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1320 = reshape(shape = var_1319, x = k_19)[name = tensor<string, []>("op_1320")];
+            tensor<int32, [3]> var_1322 = const()[name = tensor<string, []>("op_1322"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1323 = reshape(shape = var_1322, x = k_19)[name = tensor<string, []>("op_1323")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [3]>([9, 4, 64])];
-            tensor<fp32, [9, 4, 64]> var_1327 = reshape(shape = var_1326, x = v_19)[name = tensor<string, []>("op_1327")];
+            tensor<int32, [3]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [3]>([9, 4, 64])];
+            tensor<fp32, [9, 4, 64]> var_1330 = reshape(shape = var_1329, x = v_19)[name = tensor<string, []>("op_1330")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1330 = const()[name = tensor<string, []>("op_1330"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1313)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [1, 4, 9, 64]> q = reshape(shape = var_1330, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1332 = const()[name = tensor<string, []>("op_1332"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1320)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [1, 4, 9, 64]> k = reshape(shape = var_1332, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1334 = const()[name = tensor<string, []>("op_1334"), val = tensor<int32, [4]>([1, 4, 9, 64])];
-            tensor<fp32, [4, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1327)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [1, 4, 9, 64]> v = reshape(shape = var_1334, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1316)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [1, 4, 9, 64]> q = reshape(shape = var_1333, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1323)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [1, 4, 9, 64]> k = reshape(shape = var_1335, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([1, 4, 9, 64])];
+            tensor<fp32, [4, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1330)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [1, 4, 9, 64]> v = reshape(shape = var_1337, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 9, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1139,34 +1138,34 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 9, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1342 = const()[name = tensor<string, []>("op_1342"), val = tensor<int32, [2]>([9, 256])];
-            tensor<fp32, [9, 1, 4, 64]> var_1338 = transpose(perm = var_1337, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [9, 256]> attn_output_11 = reshape(shape = var_1342, x = var_1338)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [9, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([9, 1, 256])];
-            tensor<fp32, [9, 1, 256]> attn_output = reshape(shape = var_1346, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1345 = const()[name = tensor<string, []>("op_1345"), val = tensor<int32, [2]>([9, 256])];
+            tensor<fp32, [9, 1, 4, 64]> var_1341 = transpose(perm = var_1340, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [9, 256]> attn_output_11 = reshape(shape = var_1345, x = var_1341)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [9, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1349 = const()[name = tensor<string, []>("op_1349"), val = tensor<int32, [3]>([9, 1, 256])];
+            tensor<fp32, [9, 1, 256]> attn_output = reshape(shape = var_1349, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 9, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [1, 9, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 9, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_921, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [1, 9, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [1, 9, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [1, 9, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [1, 9, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 9, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 9, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_36, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [1, 9, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [1, 9, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 9, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [1, 9, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_921, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 9, 256])];
-            tensor<fp32, [1, 1, 9, 256]> input = reshape(shape = var_1366, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1368 = const()[name = tensor<string, []>("op_1368"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 9, 1]> var_1369 = reduce_l2_norm(axes = var_1368, keep_dims = var_924, x = input)[name = tensor<string, []>("op_1369")];
+            tensor<fp32, [1, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_36, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([1, 1, 9, 256])];
+            tensor<fp32, [1, 1, 9, 256]> input = reshape(shape = var_1369, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 9, 1]> var_1372 = reduce_l2_norm(axes = var_1371, keep_dims = var_35, x = input)[name = tensor<string, []>("op_1372")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 9, 1]> clip_5 = clip(alpha = var_916, beta = const_42, x = var_1369)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 1, 9, 256]> var_1371 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1371")];
+            tensor<fp32, [1, 1, 9, 1]> clip_5 = clip(alpha = var_49, beta = const_42, x = var_1372)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 1, 9, 256]> var_1374 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1374")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([1, 256, 9])];
-            tensor<fp32, [1, 1, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1371)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 1, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1374)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [1, 256, 9]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1175,10 +1174,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 1, 8])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 1, 7]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = matmul_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 1, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1375")];
-            tensor<int32, []> var_1377_axis_0 = const()[name = tensor<string, []>("op_1377_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1377_axis_0, values = (var_1073, nkv))[name = tensor<string, []>("op_1377")];
-            tensor<int32, []> var_1379_axis_0 = const()[name = tensor<string, []>("op_1379_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1379_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1379")];
+            tensor<fp32, [1, 1, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1378")];
+            tensor<int32, []> var_1380_axis_0 = const()[name = tensor<string, []>("op_1380_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1380_axis_0, values = (var_1076, nkv))[name = tensor<string, []>("op_1380")];
+            tensor<int32, []> var_1382_axis_0 = const()[name = tensor<string, []>("op_1382_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1382_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1382")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 42c372e11d6d85d4e11b133904269e202b57e146..f9e0a2c3d6a9d275fe99a5b6e10ee2a63f955685 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a15eadb3397bff9c66d9266e9a495a92e7913dd25b985447b5efd0ebd212d182
-size 171357
+oid sha256:d27af30cb86aac3d4db11e7ed026c089dec4ccef4d0c7e12f0e7d59bf0b86092
+size 175275
diff --git a/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Manifest.json b/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Manifest.json
index 6438d33c88f42770e5903b4cf8b6b7fc7ffd893a..9fb9ade7c21b566c958ea9718a94e56ccf55686e 100644
--- a/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Manifest.json
+++ b/optimized/ch/100ms/ls_eend_ch_100ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "15BB7841-F157-4C26-9962-7E006B1F650B": {
+        "00C8998E-3FCC-440D-86B1-4A910C90D3C0": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         },
-        "69383227-3EDF-4375-8C74-435BD4941612": {
+        "470AB283-D36B-4390-A0B0-CABAEA84E932": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "15BB7841-F157-4C26-9962-7E006B1F650B"
+    "rootModelIdentifier": "00C8998E-3FCC-440D-86B1-4A910C90D3C0"
 }
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/analytics/coremldata.bin b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/analytics/coremldata.bin
index a37607ed47f4304af3eb57d2df22a9ad4ac88e3e..8f1abe174990c18e0bd327ab363aacda857d642b 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e72fc8970f10d313ad089b47fe4fcc1e390363db50c413f17b9fb0d756a8cea1
+oid sha256:b1c408f22c25e1aa2254bb0ea9d086db795029b10e465e8855e0ddb009393d89
 size 243
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/coremldata.bin b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/coremldata.bin
index fd51deca07943f75691c3895fec309e63919fa02..ce1acf08093f90402bc6f90e2d7d5ed0391056e7 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/coremldata.bin
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:230d025ab4e17869d9301adf650df4fa921472c5d5582a9f365e7d70f3f13585
-size 1301
+oid sha256:25182b8ef75f90ca7ceb64114d45ea7a19e8500cb4ab6b7ce7250f8ad694d7ea
+size 1404
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/metadata.json b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/metadata.json
index ffa39d702bedf6118ed46a822ffe5aee24cfcb30..61203cd26f46e815e985ea745a6b63d352292e3e 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/metadata.json
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=2, max_speakers=7)",
+    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=2, max_speakers=7, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 48,
+      "Ios17.sliceByIndex" : 50,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 14,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 2 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 25 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 2, 345]",
+        "shape" : "[1, 25, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 25}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/model.mil b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/model.mil
index 801b456ed38b11b675323d063b50f9d105936618..3e74603b0651724dff587b0ac9da39a164d753ed 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/model.mil
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlmodelc/model.mil
@@ -1,234 +1,248 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 2, 345]> features, tensor<fp32, [2]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [2, 2]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [2]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [2, 2]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 2, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 25, 23]> features, tensor<fp32, [2]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [2, 2]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [2]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [2, 2]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, [3]>([1, 2, 345])];
+            tensor<fp32, [1, 2, 345]> input_1 = reshape(shape = var_36, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_45 = const()[name = tensor<string, []>("op_45"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_51 = const()[name = tensor<string, []>("op_51"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 2, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 2, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 2, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_46, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 2, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 2, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_183 = const()[name = tensor<string, []>("op_183"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_184 = mul(x = input_13, y = var_183)[name = tensor<string, []>("op_184")];
+            tensor<fp32, [1, 2, 256]> input_15 = add(x = var_184, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,153 +253,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 2, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 2, 256]> var_198 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_199 = const()[name = tensor<string, []>("op_199"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_200 = reshape(shape = var_199, x = var_198)[name = tensor<string, []>("op_200")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 2, 256]> var_204 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_205 = const()[name = tensor<string, []>("op_205"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_206 = mul(x = var_204, y = var_205)[name = tensor<string, []>("op_206")];
+            tensor<int32, [4]> var_207 = const()[name = tensor<string, []>("op_207"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_208 = reshape(shape = var_207, x = var_206)[name = tensor<string, []>("op_208")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 2, 256]> var_212 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_213 = const()[name = tensor<string, []>("op_213"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_214 = reshape(shape = var_213, x = var_212)[name = tensor<string, []>("op_214")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 2, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [2]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_208)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_200)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 2, 2]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [2, 2]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 2, 2]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_224 = const()[name = tensor<string, []>("op_224"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_225 = reshape(shape = var_224, x = sqrt_s_t_1)[name = tensor<string, []>("op_225")];
+            tensor<fp32, [2, 2]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_225)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 2, 2]> var_227 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_227")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [2]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_214)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_227, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_229_transpose_x_0 = const()[name = tensor<string, []>("op_229_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_229_transpose_y_0 = const()[name = tensor<string, []>("op_229_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_229 = matmul(transpose_x = var_229_transpose_x_0, transpose_y = var_229_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_229")];
+            tensor<fp32, [2]> var_230 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_230")];
+            tensor<int32, [4]> var_231 = const()[name = tensor<string, []>("op_231"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_232 = reshape(shape = var_231, x = var_230)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_229, y = var_232)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 2, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_235 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_235")];
+            tensor<bool, []> var_237_transpose_x_1 = const()[name = tensor<string, []>("op_237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_237_transpose_y_1 = const()[name = tensor<string, []>("op_237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_237 = matmul(transpose_x = var_237_transpose_x_1, transpose_y = var_237_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_237")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_235, y = var_237)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_239 = const()[name = tensor<string, []>("op_239"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_239)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_241 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 4, 64, 64]> var_242 = real_div(x = new_kv_unnorm_1, y = var_241)[name = tensor<string, []>("op_242")];
+            tensor<int32, [4]> var_243_perm_0 = const()[name = tensor<string, []>("op_243_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 2, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 2, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 2, 4, 64]> var_243 = transpose(perm = var_243_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_54, x = var_243)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_247 = const()[name = tensor<string, []>("op_247"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_247, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 2, 256]> var_249 = silu(x = input_19)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 2, 256]> input_21 = mul(x = var_249, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = x_3)[name = tensor<string, []>("op_257")];
+            tensor<int32, [3]> var_260_begin_0 = const()[name = tensor<string, []>("op_260_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_260_end_0 = const()[name = tensor<string, []>("op_260_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_260_end_mask_0 = const()[name = tensor<string, []>("op_260_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_260 = slice_by_index(begin = var_260_begin_0, end = var_260_end_0, end_mask = var_260_end_mask_0, x = window_1)[name = tensor<string, []>("op_260")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_62, interleave = window_3_interleave_0, values = (var_260, var_257))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_265_begin_0 = const()[name = tensor<string, []>("op_265_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_265_end_0 = const()[name = tensor<string, []>("op_265_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_265_end_mask_0 = const()[name = tensor<string, []>("op_265_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_265 = slice_by_index(begin = var_265_begin_0, end = var_265_end_0, end_mask = var_265_end_mask_0, x = x_3)[name = tensor<string, []>("op_265")];
+            tensor<int32, [3]> var_268_begin_0 = const()[name = tensor<string, []>("op_268_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_268_end_0 = const()[name = tensor<string, []>("op_268_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_268_end_mask_0 = const()[name = tensor<string, []>("op_268_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_268 = slice_by_index(begin = var_268_begin_0, end = var_268_end_0, end_mask = var_268_end_mask_0, x = window_3)[name = tensor<string, []>("op_268")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_62, interleave = window_5_interleave_0, values = (var_268, var_265))[name = tensor<string, []>("window_5")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_23 = concat(axis = var_49, interleave = input_23_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_257_split_sizes_0 = const()[name = tensor<string, []>("op_257_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_257_axis_0 = const()[name = tensor<string, []>("op_257_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_257_0, tensor<fp32, [2, 256, 16]> var_257_1 = split(axis = var_257_axis_0, split_sizes = var_257_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_257")];
-            tensor<fp32, [2, 256, 16]> var_259 = sigmoid(x = var_257_1)[name = tensor<string, []>("op_259")];
-            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_257_0, y = var_259)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [2, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_293_split_sizes_0 = const()[name = tensor<string, []>("op_293_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_293_axis_0 = const()[name = tensor<string, []>("op_293_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_293_0, tensor<fp32, [2, 256, 16]> var_293_1 = split(axis = var_293_axis_0, split_sizes = var_293_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_293")];
+            tensor<fp32, [2, 256, 16]> var_295 = sigmoid(x = var_293_1)[name = tensor<string, []>("op_295")];
+            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_293_0, y = var_295)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [2, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [2, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [2, 1, 256]> var_290 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_290")];
-            tensor<int32, [3]> var_292_perm_0 = const()[name = tensor<string, []>("op_292_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_292 = transpose(perm = var_292_perm_0, x = var_290)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 2, 256]> input_31 = add(x = x_3, y = var_292)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 2, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 2, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_315 = const()[name = tensor<string, []>("op_315"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_316 = mul(x = input_39, y = var_315)[name = tensor<string, []>("op_316")];
-            tensor<fp32, [1, 2, 256]> input_41 = add(x = var_316, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_326_begin_0 = const()[name = tensor<string, []>("op_326_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_326_end_0 = const()[name = tensor<string, []>("op_326_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_326_end_mask_0 = const()[name = tensor<string, []>("op_326_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [2, 1, 256]> var_326 = slice_by_index(begin = var_326_begin_0, end = var_326_end_0, end_mask = var_326_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_326")];
+            tensor<int32, [3]> var_328_perm_0 = const()[name = tensor<string, []>("op_328_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_328 = transpose(perm = var_328_perm_0, x = var_326)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 2, 256]> input_33 = add(x = x_3, y = var_328)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 2, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 2, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_351 = const()[name = tensor<string, []>("op_351"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_352 = mul(x = input_41, y = var_351)[name = tensor<string, []>("op_352")];
+            tensor<fp32, [1, 2, 256]> input_43 = add(x = var_352, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 2, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 2, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_345 = const()[name = tensor<string, []>("op_345"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_346 = mul(x = input_51, y = var_345)[name = tensor<string, []>("op_346")];
-            tensor<fp32, [1, 2, 256]> input_53 = add(x = var_346, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 2, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 2, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_381 = const()[name = tensor<string, []>("op_381"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_382 = mul(x = input_53, y = var_381)[name = tensor<string, []>("op_382")];
+            tensor<fp32, [1, 2, 256]> input_55 = add(x = var_382, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -396,153 +410,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 2, 256]> var_360 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 2, 256]> var_396 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_397 = const()[name = tensor<string, []>("op_397"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_398 = reshape(shape = var_397, x = var_396)[name = tensor<string, []>("op_398")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_366 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_368 = mul(x = var_366, y = var_367)[name = tensor<string, []>("op_368")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 2, 256]> var_402 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_403 = const()[name = tensor<string, []>("op_403"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_404 = mul(x = var_402, y = var_403)[name = tensor<string, []>("op_404")];
+            tensor<int32, [4]> var_405 = const()[name = tensor<string, []>("op_405"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_406 = reshape(shape = var_405, x = var_404)[name = tensor<string, []>("op_406")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_374 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_376 = reshape(shape = var_375, x = var_374)[name = tensor<string, []>("op_376")];
+            tensor<fp32, [1, 2, 256]> var_410 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 2, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [2]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_406)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_398)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 2, 2]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_387 = reshape(shape = var_386, x = sqrt_s_t_3)[name = tensor<string, []>("op_387")];
-            tensor<fp32, [2, 2]> M_3 = real_div(x = encoder__causal_mask, y = var_387)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 2, 2]> var_389 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_389")];
+            tensor<int32, [2]> var_422 = const()[name = tensor<string, []>("op_422"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_423 = reshape(shape = var_422, x = sqrt_s_t_3)[name = tensor<string, []>("op_423")];
+            tensor<fp32, [2, 2]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_423)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 2, 2]> var_425 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_425")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_376)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_389, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_391_transpose_x_0 = const()[name = tensor<string, []>("op_391_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_391_transpose_y_0 = const()[name = tensor<string, []>("op_391_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_0, transpose_y = var_391_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [2]> var_392 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_392")];
-            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
-            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_391, y = var_394)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_412)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_425, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_427_transpose_x_0 = const()[name = tensor<string, []>("op_427_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_427_transpose_y_0 = const()[name = tensor<string, []>("op_427_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_427 = matmul(transpose_x = var_427_transpose_x_0, transpose_y = var_427_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_427")];
+            tensor<fp32, [2]> var_428 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_428")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_427, y = var_430)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 2, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_397 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_397")];
-            tensor<bool, []> var_399_transpose_x_1 = const()[name = tensor<string, []>("op_399_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_399_transpose_y_1 = const()[name = tensor<string, []>("op_399_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_1, transpose_y = var_399_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_397, y = var_399)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_401)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_403 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 4, 64, 64]> var_404 = real_div(x = new_kv_unnorm_3, y = var_403)[name = tensor<string, []>("op_404")];
-            tensor<int32, [4]> var_405_perm_0 = const()[name = tensor<string, []>("op_405_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_433 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_433")];
+            tensor<bool, []> var_435_transpose_x_1 = const()[name = tensor<string, []>("op_435_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_435_transpose_y_1 = const()[name = tensor<string, []>("op_435_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_435 = matmul(transpose_x = var_435_transpose_x_1, transpose_y = var_435_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_435")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_433, y = var_435)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_437 = const()[name = tensor<string, []>("op_437"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_437)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_439 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_439")];
+            tensor<fp32, [1, 4, 64, 64]> var_440 = real_div(x = new_kv_unnorm_3, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441_perm_0 = const()[name = tensor<string, []>("op_441_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_405 = transpose(perm = var_405_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_405)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_409, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 2, 256]> var_411 = silu(x = input_57)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 2, 256]> input_59 = mul(x = var_411, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 2, 4, 64]> var_441 = transpose(perm = var_441_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_54, x = var_441)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_445 = const()[name = tensor<string, []>("op_445"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_445, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 2, 256]> var_447 = silu(x = input_59)[name = tensor<string, []>("op_447")];
+            tensor<fp32, [1, 2, 256]> input_61 = mul(x = var_447, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_7_begin_0 = const()[name = tensor<string, []>("window_7_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_7_end_0 = const()[name = tensor<string, []>("window_7_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_7_end_mask_0 = const()[name = tensor<string, []>("window_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_7_squeeze_mask_0 = const()[name = tensor<string, []>("window_7_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_7 = slice_by_index(begin = window_7_begin_0, end = window_7_end_0, end_mask = window_7_end_mask_0, squeeze_mask = window_7_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_419 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = x_9)[name = tensor<string, []>("op_419")];
-            tensor<int32, [3]> var_422_begin_0 = const()[name = tensor<string, []>("op_422_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_422_end_0 = const()[name = tensor<string, []>("op_422_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_422_end_mask_0 = const()[name = tensor<string, []>("op_422_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_422 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = window_7)[name = tensor<string, []>("op_422")];
+            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = x_9)[name = tensor<string, []>("op_455")];
+            tensor<int32, [3]> var_458_begin_0 = const()[name = tensor<string, []>("op_458_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_458_end_0 = const()[name = tensor<string, []>("op_458_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_458_end_mask_0 = const()[name = tensor<string, []>("op_458_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_458 = slice_by_index(begin = var_458_begin_0, end = var_458_end_0, end_mask = var_458_end_mask_0, x = window_7)[name = tensor<string, []>("op_458")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_422, var_419))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_62, interleave = window_9_interleave_0, values = (var_458, var_455))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = x_9)[name = tensor<string, []>("op_463")];
+            tensor<int32, [3]> var_466_begin_0 = const()[name = tensor<string, []>("op_466_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_466_end_0 = const()[name = tensor<string, []>("op_466_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_466_end_mask_0 = const()[name = tensor<string, []>("op_466_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_466 = slice_by_index(begin = var_466_begin_0, end = var_466_end_0, end_mask = var_466_end_mask_0, x = window_9)[name = tensor<string, []>("op_466")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_62, interleave = window_11_interleave_0, values = (var_466, var_463))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_63 = concat(axis = var_49, interleave = input_63_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_455_split_sizes_0 = const()[name = tensor<string, []>("op_455_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_455_axis_0 = const()[name = tensor<string, []>("op_455_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_455_0, tensor<fp32, [2, 256, 16]> var_455_1 = split(axis = var_455_axis_0, split_sizes = var_455_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_455")];
-            tensor<fp32, [2, 256, 16]> var_457 = sigmoid(x = var_455_1)[name = tensor<string, []>("op_457")];
-            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_455_0, y = var_457)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [2, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_491_split_sizes_0 = const()[name = tensor<string, []>("op_491_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_491_axis_0 = const()[name = tensor<string, []>("op_491_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_491_0, tensor<fp32, [2, 256, 16]> var_491_1 = split(axis = var_491_axis_0, split_sizes = var_491_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_491")];
+            tensor<fp32, [2, 256, 16]> var_493 = sigmoid(x = var_491_1)[name = tensor<string, []>("op_493")];
+            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_491_0, y = var_493)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [2, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [2, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_488_begin_0 = const()[name = tensor<string, []>("op_488_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_488_end_0 = const()[name = tensor<string, []>("op_488_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_488_end_mask_0 = const()[name = tensor<string, []>("op_488_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [2, 1, 256]> var_488 = slice_by_index(begin = var_488_begin_0, end = var_488_end_0, end_mask = var_488_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_488")];
-            tensor<int32, [3]> var_490_perm_0 = const()[name = tensor<string, []>("op_490_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_490 = transpose(perm = var_490_perm_0, x = var_488)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 2, 256]> input_71 = add(x = x_9, y = var_490)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 2, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 2, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_513 = const()[name = tensor<string, []>("op_513"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_514 = mul(x = input_79, y = var_513)[name = tensor<string, []>("op_514")];
-            tensor<fp32, [1, 2, 256]> input_81 = add(x = var_514, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_524_begin_0 = const()[name = tensor<string, []>("op_524_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_524_end_0 = const()[name = tensor<string, []>("op_524_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_524_end_mask_0 = const()[name = tensor<string, []>("op_524_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [2, 1, 256]> var_524 = slice_by_index(begin = var_524_begin_0, end = var_524_end_0, end_mask = var_524_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_524")];
+            tensor<int32, [3]> var_526_perm_0 = const()[name = tensor<string, []>("op_526_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_526 = transpose(perm = var_526_perm_0, x = var_524)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 2, 256]> input_73 = add(x = x_9, y = var_526)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 2, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 2, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_550 = mul(x = input_81, y = var_549)[name = tensor<string, []>("op_550")];
+            tensor<fp32, [1, 2, 256]> input_83 = add(x = var_550, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 2, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 2, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_544 = mul(x = input_91, y = var_543)[name = tensor<string, []>("op_544")];
-            tensor<fp32, [1, 2, 256]> input_93 = add(x = var_544, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 2, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 2, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_579 = const()[name = tensor<string, []>("op_579"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_580 = mul(x = input_93, y = var_579)[name = tensor<string, []>("op_580")];
+            tensor<fp32, [1, 2, 256]> input_95 = add(x = var_580, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -553,153 +567,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 2, 256]> var_558 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_560 = reshape(shape = var_559, x = var_558)[name = tensor<string, []>("op_560")];
+            tensor<fp32, [1, 2, 256]> var_594 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_595 = const()[name = tensor<string, []>("op_595"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_596 = reshape(shape = var_595, x = var_594)[name = tensor<string, []>("op_596")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_564 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_565 = const()[name = tensor<string, []>("op_565"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_566 = mul(x = var_564, y = var_565)[name = tensor<string, []>("op_566")];
-            tensor<int32, [4]> var_567 = const()[name = tensor<string, []>("op_567"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_568 = reshape(shape = var_567, x = var_566)[name = tensor<string, []>("op_568")];
+            tensor<fp32, [1, 2, 256]> var_600 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_602 = mul(x = var_600, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<int32, [4]> var_603 = const()[name = tensor<string, []>("op_603"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_604 = reshape(shape = var_603, x = var_602)[name = tensor<string, []>("op_604")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_572 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_573 = const()[name = tensor<string, []>("op_573"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_574 = reshape(shape = var_573, x = var_572)[name = tensor<string, []>("op_574")];
+            tensor<fp32, [1, 2, 256]> var_608 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_610 = reshape(shape = var_609, x = var_608)[name = tensor<string, []>("op_610")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 2, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [2]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_568)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_560)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_604)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_596)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 2, 2]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_584 = const()[name = tensor<string, []>("op_584"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_585 = reshape(shape = var_584, x = sqrt_s_t_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [2, 2]> M_5 = real_div(x = encoder__causal_mask, y = var_585)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 2, 2]> var_587 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_587")];
+            tensor<int32, [2]> var_620 = const()[name = tensor<string, []>("op_620"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_621 = reshape(shape = var_620, x = sqrt_s_t_5)[name = tensor<string, []>("op_621")];
+            tensor<fp32, [2, 2]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_621)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 2, 2]> var_623 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_623")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_574)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_587, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_589_transpose_x_0 = const()[name = tensor<string, []>("op_589_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_589_transpose_y_0 = const()[name = tensor<string, []>("op_589_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_589 = matmul(transpose_x = var_589_transpose_x_0, transpose_y = var_589_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_589")];
-            tensor<fp32, [2]> var_590 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_590")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
-            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_589, y = var_592)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_610)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_623, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_625_transpose_x_0 = const()[name = tensor<string, []>("op_625_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_625_transpose_y_0 = const()[name = tensor<string, []>("op_625_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_625 = matmul(transpose_x = var_625_transpose_x_0, transpose_y = var_625_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_625")];
+            tensor<fp32, [2]> var_626 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_626")];
+            tensor<int32, [4]> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_628 = reshape(shape = var_627, x = var_626)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_625, y = var_628)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 2, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_595 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_595")];
-            tensor<bool, []> var_597_transpose_x_1 = const()[name = tensor<string, []>("op_597_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_597_transpose_y_1 = const()[name = tensor<string, []>("op_597_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_597 = matmul(transpose_x = var_597_transpose_x_1, transpose_y = var_597_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_597")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_595, y = var_597)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_599)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_601 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [1, 4, 64, 64]> var_602 = real_div(x = new_kv_unnorm_5, y = var_601)[name = tensor<string, []>("op_602")];
-            tensor<int32, [4]> var_603_perm_0 = const()[name = tensor<string, []>("op_603_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_631 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_631")];
+            tensor<bool, []> var_633_transpose_x_1 = const()[name = tensor<string, []>("op_633_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_633_transpose_y_1 = const()[name = tensor<string, []>("op_633_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_633 = matmul(transpose_x = var_633_transpose_x_1, transpose_y = var_633_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_633")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_631, y = var_633)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_635)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_637 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_637")];
+            tensor<fp32, [1, 4, 64, 64]> var_638 = real_div(x = new_kv_unnorm_5, y = var_637)[name = tensor<string, []>("op_638")];
+            tensor<int32, [4]> var_639_perm_0 = const()[name = tensor<string, []>("op_639_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_603 = transpose(perm = var_603_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_603)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_607, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 2, 256]> var_609 = silu(x = input_97)[name = tensor<string, []>("op_609")];
-            tensor<fp32, [1, 2, 256]> input_99 = mul(x = var_609, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 2, 4, 64]> var_639 = transpose(perm = var_639_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_54, x = var_639)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_643 = const()[name = tensor<string, []>("op_643"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_643, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 2, 256]> var_645 = silu(x = input_99)[name = tensor<string, []>("op_645")];
+            tensor<fp32, [1, 2, 256]> input_101 = mul(x = var_645, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_617_begin_0 = const()[name = tensor<string, []>("op_617_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_617_end_0 = const()[name = tensor<string, []>("op_617_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_617_end_mask_0 = const()[name = tensor<string, []>("op_617_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_617 = slice_by_index(begin = var_617_begin_0, end = var_617_end_0, end_mask = var_617_end_mask_0, x = x_15)[name = tensor<string, []>("op_617")];
-            tensor<int32, [3]> var_620_begin_0 = const()[name = tensor<string, []>("op_620_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_620_end_0 = const()[name = tensor<string, []>("op_620_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_620_end_mask_0 = const()[name = tensor<string, []>("op_620_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_620 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = window_13)[name = tensor<string, []>("op_620")];
+            tensor<int32, [3]> var_653_begin_0 = const()[name = tensor<string, []>("op_653_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_653_end_0 = const()[name = tensor<string, []>("op_653_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_653_end_mask_0 = const()[name = tensor<string, []>("op_653_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_653 = slice_by_index(begin = var_653_begin_0, end = var_653_end_0, end_mask = var_653_end_mask_0, x = x_15)[name = tensor<string, []>("op_653")];
+            tensor<int32, [3]> var_656_begin_0 = const()[name = tensor<string, []>("op_656_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_656_end_0 = const()[name = tensor<string, []>("op_656_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_656_end_mask_0 = const()[name = tensor<string, []>("op_656_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_656 = slice_by_index(begin = var_656_begin_0, end = var_656_end_0, end_mask = var_656_end_mask_0, x = window_13)[name = tensor<string, []>("op_656")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_620, var_617))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_625_begin_0 = const()[name = tensor<string, []>("op_625_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_625_end_0 = const()[name = tensor<string, []>("op_625_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_625_end_mask_0 = const()[name = tensor<string, []>("op_625_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_625 = slice_by_index(begin = var_625_begin_0, end = var_625_end_0, end_mask = var_625_end_mask_0, x = x_15)[name = tensor<string, []>("op_625")];
-            tensor<int32, [3]> var_628_begin_0 = const()[name = tensor<string, []>("op_628_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_628_end_0 = const()[name = tensor<string, []>("op_628_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_628_end_mask_0 = const()[name = tensor<string, []>("op_628_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_628 = slice_by_index(begin = var_628_begin_0, end = var_628_end_0, end_mask = var_628_end_mask_0, x = window_15)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_62, interleave = window_15_interleave_0, values = (var_656, var_653))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_661_begin_0 = const()[name = tensor<string, []>("op_661_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_661_end_0 = const()[name = tensor<string, []>("op_661_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_661_end_mask_0 = const()[name = tensor<string, []>("op_661_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_661 = slice_by_index(begin = var_661_begin_0, end = var_661_end_0, end_mask = var_661_end_mask_0, x = x_15)[name = tensor<string, []>("op_661")];
+            tensor<int32, [3]> var_664_begin_0 = const()[name = tensor<string, []>("op_664_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_664_end_0 = const()[name = tensor<string, []>("op_664_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_664_end_mask_0 = const()[name = tensor<string, []>("op_664_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_664 = slice_by_index(begin = var_664_begin_0, end = var_664_end_0, end_mask = var_664_end_mask_0, x = window_15)[name = tensor<string, []>("op_664")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_628, var_625))[name = tensor<string, []>("window_17")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_62, interleave = window_17_interleave_0, values = (var_664, var_661))[name = tensor<string, []>("window_17")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_103 = concat(axis = var_49, interleave = input_103_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_653_split_sizes_0 = const()[name = tensor<string, []>("op_653_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_653_axis_0 = const()[name = tensor<string, []>("op_653_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_653_0, tensor<fp32, [2, 256, 16]> var_653_1 = split(axis = var_653_axis_0, split_sizes = var_653_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_653")];
-            tensor<fp32, [2, 256, 16]> var_655 = sigmoid(x = var_653_1)[name = tensor<string, []>("op_655")];
-            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_653_0, y = var_655)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [2, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_689_split_sizes_0 = const()[name = tensor<string, []>("op_689_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_689_axis_0 = const()[name = tensor<string, []>("op_689_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_689_0, tensor<fp32, [2, 256, 16]> var_689_1 = split(axis = var_689_axis_0, split_sizes = var_689_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [2, 256, 16]> var_691 = sigmoid(x = var_689_1)[name = tensor<string, []>("op_691")];
+            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_689_0, y = var_691)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [2, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [2, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_686_begin_0 = const()[name = tensor<string, []>("op_686_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_686_end_0 = const()[name = tensor<string, []>("op_686_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_686_end_mask_0 = const()[name = tensor<string, []>("op_686_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [2, 1, 256]> var_686 = slice_by_index(begin = var_686_begin_0, end = var_686_end_0, end_mask = var_686_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_686")];
-            tensor<int32, [3]> var_688_perm_0 = const()[name = tensor<string, []>("op_688_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_688 = transpose(perm = var_688_perm_0, x = var_686)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 2, 256]> input_111 = add(x = x_15, y = var_688)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 2, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 2, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_711 = const()[name = tensor<string, []>("op_711"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_712 = mul(x = input_119, y = var_711)[name = tensor<string, []>("op_712")];
-            tensor<fp32, [1, 2, 256]> input_121 = add(x = var_712, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_722_begin_0 = const()[name = tensor<string, []>("op_722_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_722_end_0 = const()[name = tensor<string, []>("op_722_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_722_end_mask_0 = const()[name = tensor<string, []>("op_722_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [2, 1, 256]> var_722 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_722")];
+            tensor<int32, [3]> var_724_perm_0 = const()[name = tensor<string, []>("op_724_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_724 = transpose(perm = var_724_perm_0, x = var_722)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 2, 256]> input_113 = add(x = x_15, y = var_724)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 2, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 2, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_748 = mul(x = input_121, y = var_747)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 2, 256]> input_123 = add(x = var_748, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 2, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 2, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_742 = mul(x = input_131, y = var_741)[name = tensor<string, []>("op_742")];
-            tensor<fp32, [1, 2, 256]> input_133 = add(x = var_742, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 2, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 2, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_777 = const()[name = tensor<string, []>("op_777"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_778 = mul(x = input_133, y = var_777)[name = tensor<string, []>("op_778")];
+            tensor<fp32, [1, 2, 256]> input_135 = add(x = var_778, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -710,189 +724,182 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 2, 256]> var_756 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_757 = const()[name = tensor<string, []>("op_757"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_758 = reshape(shape = var_757, x = var_756)[name = tensor<string, []>("op_758")];
+            tensor<fp32, [1, 2, 256]> var_792 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_793 = const()[name = tensor<string, []>("op_793"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_794 = reshape(shape = var_793, x = var_792)[name = tensor<string, []>("op_794")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_762 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_763 = const()[name = tensor<string, []>("op_763"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_764 = mul(x = var_762, y = var_763)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
+            tensor<fp32, [1, 2, 256]> var_798 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_800 = mul(x = var_798, y = var_799)[name = tensor<string, []>("op_800")];
+            tensor<int32, [4]> var_801 = const()[name = tensor<string, []>("op_801"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_802 = reshape(shape = var_801, x = var_800)[name = tensor<string, []>("op_802")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_770 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_771 = const()[name = tensor<string, []>("op_771"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_772 = reshape(shape = var_771, x = var_770)[name = tensor<string, []>("op_772")];
+            tensor<fp32, [1, 2, 256]> var_806 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_808 = reshape(shape = var_807, x = var_806)[name = tensor<string, []>("op_808")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 2, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [2]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_766)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_758)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_802)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_794)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 2, 2]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_783 = reshape(shape = var_782, x = sqrt_s_t_7)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [2, 2]> M_7 = real_div(x = encoder__causal_mask, y = var_783)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 2, 2]> var_785 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_785")];
+            tensor<int32, [2]> var_818 = const()[name = tensor<string, []>("op_818"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_819 = reshape(shape = var_818, x = sqrt_s_t_7)[name = tensor<string, []>("op_819")];
+            tensor<fp32, [2, 2]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_819)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 2, 2]> var_821 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_821")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_772)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_785, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_787_transpose_x_0 = const()[name = tensor<string, []>("op_787_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_787_transpose_y_0 = const()[name = tensor<string, []>("op_787_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_787 = matmul(transpose_x = var_787_transpose_x_0, transpose_y = var_787_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_787")];
-            tensor<fp32, [2]> var_788 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_787, y = var_790)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_808)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_821, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_823_transpose_x_0 = const()[name = tensor<string, []>("op_823_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_823_transpose_y_0 = const()[name = tensor<string, []>("op_823_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_823 = matmul(transpose_x = var_823_transpose_x_0, transpose_y = var_823_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_823")];
+            tensor<fp32, [2]> var_824 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_824")];
+            tensor<int32, [4]> var_825 = const()[name = tensor<string, []>("op_825"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_826 = reshape(shape = var_825, x = var_824)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_823, y = var_826)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 2, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_793 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_793")];
-            tensor<bool, []> var_795_transpose_x_1 = const()[name = tensor<string, []>("op_795_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_795_transpose_y_1 = const()[name = tensor<string, []>("op_795_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_795 = matmul(transpose_x = var_795_transpose_x_1, transpose_y = var_795_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_795")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_793, y = var_795)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_797 = const()[name = tensor<string, []>("op_797"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_797)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_799 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_799")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_799)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_801_perm_0 = const()[name = tensor<string, []>("op_801_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_829 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_829")];
+            tensor<bool, []> var_831_transpose_x_1 = const()[name = tensor<string, []>("op_831_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_831_transpose_y_1 = const()[name = tensor<string, []>("op_831_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_831 = matmul(transpose_x = var_831_transpose_x_1, transpose_y = var_831_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_829, y = var_831)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_833)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_835 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_835")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_835)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_837_perm_0 = const()[name = tensor<string, []>("op_837_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_801 = transpose(perm = var_801_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_801)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_805, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 2, 256]> var_807 = silu(x = input_137)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [1, 2, 256]> input_139 = mul(x = var_807, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 2, 4, 64]> var_837 = transpose(perm = var_837_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_54, x = var_837)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_841, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 2, 256]> var_843 = silu(x = input_139)[name = tensor<string, []>("op_843")];
+            tensor<fp32, [1, 2, 256]> input_141 = mul(x = var_843, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_19_begin_0 = const()[name = tensor<string, []>("window_19_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_19_end_0 = const()[name = tensor<string, []>("window_19_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_19_end_mask_0 = const()[name = tensor<string, []>("window_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_19_squeeze_mask_0 = const()[name = tensor<string, []>("window_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_19 = slice_by_index(begin = window_19_begin_0, end = window_19_end_0, end_mask = window_19_end_mask_0, squeeze_mask = window_19_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_815_begin_0 = const()[name = tensor<string, []>("op_815_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_815_end_0 = const()[name = tensor<string, []>("op_815_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_815_end_mask_0 = const()[name = tensor<string, []>("op_815_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_815 = slice_by_index(begin = var_815_begin_0, end = var_815_end_0, end_mask = var_815_end_mask_0, x = x_21)[name = tensor<string, []>("op_815")];
-            tensor<int32, [3]> var_818_begin_0 = const()[name = tensor<string, []>("op_818_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_818_end_0 = const()[name = tensor<string, []>("op_818_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_818_end_mask_0 = const()[name = tensor<string, []>("op_818_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_818 = slice_by_index(begin = var_818_begin_0, end = var_818_end_0, end_mask = var_818_end_mask_0, x = window_19)[name = tensor<string, []>("op_818")];
+            tensor<int32, [3]> var_851_begin_0 = const()[name = tensor<string, []>("op_851_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_851_end_0 = const()[name = tensor<string, []>("op_851_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_851_end_mask_0 = const()[name = tensor<string, []>("op_851_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_851 = slice_by_index(begin = var_851_begin_0, end = var_851_end_0, end_mask = var_851_end_mask_0, x = x_21)[name = tensor<string, []>("op_851")];
+            tensor<int32, [3]> var_854_begin_0 = const()[name = tensor<string, []>("op_854_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_854_end_0 = const()[name = tensor<string, []>("op_854_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_854_end_mask_0 = const()[name = tensor<string, []>("op_854_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_854 = slice_by_index(begin = var_854_begin_0, end = var_854_end_0, end_mask = var_854_end_mask_0, x = window_19)[name = tensor<string, []>("op_854")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_818, var_815))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_823_begin_0 = const()[name = tensor<string, []>("op_823_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_823_end_0 = const()[name = tensor<string, []>("op_823_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_823_end_mask_0 = const()[name = tensor<string, []>("op_823_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_823 = slice_by_index(begin = var_823_begin_0, end = var_823_end_0, end_mask = var_823_end_mask_0, x = x_21)[name = tensor<string, []>("op_823")];
-            tensor<int32, [3]> var_826_begin_0 = const()[name = tensor<string, []>("op_826_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_826_end_0 = const()[name = tensor<string, []>("op_826_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_826_end_mask_0 = const()[name = tensor<string, []>("op_826_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_826 = slice_by_index(begin = var_826_begin_0, end = var_826_end_0, end_mask = var_826_end_mask_0, x = window_21)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_62, interleave = window_21_interleave_0, values = (var_854, var_851))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_859_begin_0 = const()[name = tensor<string, []>("op_859_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_859_end_0 = const()[name = tensor<string, []>("op_859_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_859_end_mask_0 = const()[name = tensor<string, []>("op_859_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_859 = slice_by_index(begin = var_859_begin_0, end = var_859_end_0, end_mask = var_859_end_mask_0, x = x_21)[name = tensor<string, []>("op_859")];
+            tensor<int32, [3]> var_862_begin_0 = const()[name = tensor<string, []>("op_862_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_862_end_0 = const()[name = tensor<string, []>("op_862_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_862_end_mask_0 = const()[name = tensor<string, []>("op_862_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_862 = slice_by_index(begin = var_862_begin_0, end = var_862_end_0, end_mask = var_862_end_mask_0, x = window_21)[name = tensor<string, []>("op_862")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_826, var_823))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_62, interleave = window_interleave_0, values = (var_862, var_859))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_143 = concat(axis = var_49, interleave = input_143_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_851_split_sizes_0 = const()[name = tensor<string, []>("op_851_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_851_axis_0 = const()[name = tensor<string, []>("op_851_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_851_0, tensor<fp32, [2, 256, 16]> var_851_1 = split(axis = var_851_axis_0, split_sizes = var_851_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_851")];
-            tensor<fp32, [2, 256, 16]> var_853 = sigmoid(x = var_851_1)[name = tensor<string, []>("op_853")];
-            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_851_0, y = var_853)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [2, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_887_split_sizes_0 = const()[name = tensor<string, []>("op_887_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_887_axis_0 = const()[name = tensor<string, []>("op_887_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_887_0, tensor<fp32, [2, 256, 16]> var_887_1 = split(axis = var_887_axis_0, split_sizes = var_887_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [2, 256, 16]> var_889 = sigmoid(x = var_887_1)[name = tensor<string, []>("op_889")];
+            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_887_0, y = var_889)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [2, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [2, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_884_begin_0 = const()[name = tensor<string, []>("op_884_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_884_end_0 = const()[name = tensor<string, []>("op_884_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_884_end_mask_0 = const()[name = tensor<string, []>("op_884_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [2, 1, 256]> var_884 = slice_by_index(begin = var_884_begin_0, end = var_884_end_0, end_mask = var_884_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_884")];
-            tensor<int32, [3]> var_886_perm_0 = const()[name = tensor<string, []>("op_886_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_886 = transpose(perm = var_886_perm_0, x = var_884)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 2, 256]> input_151 = add(x = x_21, y = var_886)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 2, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 2, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_910 = mul(x = input_159, y = var_909)[name = tensor<string, []>("op_910")];
-            tensor<fp32, [1, 2, 256]> input_161 = add(x = var_910, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [2, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_920")];
+            tensor<int32, [3]> var_922_perm_0 = const()[name = tensor<string, []>("op_922_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_922 = transpose(perm = var_922_perm_0, x = var_920)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 2, 256]> input_153 = add(x = x_21, y = var_922)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 2, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 2, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_945 = const()[name = tensor<string, []>("op_945"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_946 = mul(x = input_161, y = var_945)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 2, 256]> input_163 = add(x = var_946, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 2]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_51, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
-            tensor<int32, [3]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
-            tensor<bool, [3]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = cat)[name = tensor<string, []>("op_928")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 2, 1]> var_931 = reduce_l2_norm(axes = var_930, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
+            tensor<int32, [3]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
+            tensor<bool, [3]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = cat)[name = tensor<string, []>("op_964")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_966 = const()[name = tensor<string, []>("op_966"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 2, 1]> var_967 = reduce_l2_norm(axes = var_966, keep_dims = var_45, x = input_165)[name = tensor<string, []>("op_967")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_931)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_935_axis_0 = const()[name = tensor<string, []>("op_935_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_935_axis_0, values = (var_206, var_404, var_602, nkv_1))[name = tensor<string, []>("op_935")];
-            tensor<int32, []> var_937_axis_0 = const()[name = tensor<string, []>("op_937_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_937_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_937")];
-            tensor<int32, []> var_939_axis_0 = const()[name = tensor<string, []>("op_939_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_939_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_939")];
-            tensor<fp32, []> var_948 = const()[name = tensor<string, []>("op_948"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_953 = const()[name = tensor<string, []>("op_953"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_955 = const()[name = tensor<string, []>("op_955"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_956 = const()[name = tensor<string, []>("op_956"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_958 = const()[name = tensor<string, []>("op_958"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_968 = const()[name = tensor<string, []>("op_968"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_59, beta = const_12, x = var_967)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_971_axis_0, values = (var_242, var_440, var_638, nkv_1))[name = tensor<string, []>("op_971")];
+            tensor<int32, []> var_973_axis_0 = const()[name = tensor<string, []>("op_973_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_973_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_973")];
+            tensor<int32, []> var_975_axis_0 = const()[name = tensor<string, []>("op_975_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_975_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_975")];
             tensor<fp32, [1, 2, 9, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 2, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1030_axes_0 = const()[name = tensor<string, []>("op_1030_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 2, 1, 256]> var_1030 = expand_dims(axes = var_1030_axes_0, x = emb)[name = tensor<string, []>("op_1030")];
+            tensor<int32, [1]> var_1043_axes_0 = const()[name = tensor<string, []>("op_1043_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 2, 1, 256]> var_1043 = expand_dims(axes = var_1043_axes_0, x = emb)[name = tensor<string, []>("op_1043")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 9, 1])];
-            tensor<fp32, [1, 2, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1030)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 2, 9, 512]> input_165 = concat(axis = var_962, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 2, 9, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1038_perm_0 = const()[name = tensor<string, []>("op_1038_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1042 = const()[name = tensor<string, []>("op_1042"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [1, 9, 2, 256]> var_1038 = transpose(perm = var_1038_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [9, 2, 256]> x_29 = reshape(shape = var_1042, x = var_1038)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 2, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1043)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 2, 9, 512]> input_167 = concat(axis = var_52, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 2, 9, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1051_perm_0 = const()[name = tensor<string, []>("op_1051_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [1, 9, 2, 256]> var_1051 = transpose(perm = var_1051_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [9, 2, 256]> x_29 = reshape(shape = var_1055, x = var_1051)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -903,132 +910,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [9, 2, 256]> var_1050 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1051 = const()[name = tensor<string, []>("op_1051"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1052 = reshape(shape = var_1051, x = var_1050)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [9, 2, 256]> var_1063 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1064 = const()[name = tensor<string, []>("op_1064"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1065 = reshape(shape = var_1064, x = var_1063)[name = tensor<string, []>("op_1065")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> var_1056 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1057 = const()[name = tensor<string, []>("op_1057"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 2, 256]> var_1058 = mul(x = var_1056, y = var_1057)[name = tensor<string, []>("op_1058")];
-            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [9, 2, 256]> var_1069 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1070 = const()[name = tensor<string, []>("op_1070"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 2, 256]> var_1071 = mul(x = var_1069, y = var_1070)[name = tensor<string, []>("op_1071")];
+            tensor<int32, [4]> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1073 = reshape(shape = var_1072, x = var_1071)[name = tensor<string, []>("op_1073")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> var_1064 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1066 = reshape(shape = var_1065, x = var_1064)[name = tensor<string, []>("op_1066")];
+            tensor<fp32, [9, 2, 256]> var_1077 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1079 = reshape(shape = var_1078, x = var_1077)[name = tensor<string, []>("op_1079")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [9, 2, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_968, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_49, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [2]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_1 = clip(alpha = var_958, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [2]> clip_1 = clip(alpha = var_39, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [2]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1060)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [9, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1052)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [9, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1073)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [9, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1065)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [9, 4, 2, 2]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [2]>([1, 2])];
-            tensor<fp32, [1, 2]> var_1079 = reshape(shape = var_1078, x = valid_mask)[name = tensor<string, []>("op_1079")];
-            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1079)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1082 = reshape(shape = var_1081, x = sqrt_s_t_9)[name = tensor<string, []>("op_1082")];
-            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1082)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [9, 4, 2, 2]> var_1084 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1084")];
+            tensor<int32, [2]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [2]>([1, 2])];
+            tensor<fp32, [1, 2]> var_1092 = reshape(shape = var_1091, x = valid_mask)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1092)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1094 = const()[name = tensor<string, []>("op_1094"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1095 = reshape(shape = var_1094, x = sqrt_s_t_9)[name = tensor<string, []>("op_1095")];
+            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1095)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [9, 4, 2, 2]> var_1097 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1097")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1066)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [9, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1084, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1086_transpose_x_0 = const()[name = tensor<string, []>("op_1086_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1086_transpose_y_0 = const()[name = tensor<string, []>("op_1086_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 2, 64]> var_1086 = matmul(transpose_x = var_1086_transpose_x_0, transpose_y = var_1086_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1086")];
-            tensor<fp32, [2]> var_1087 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1087")];
-            tensor<int32, [4]> var_1088 = const()[name = tensor<string, []>("op_1088"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1089 = reshape(shape = var_1088, x = var_1087)[name = tensor<string, []>("op_1089")];
-            tensor<fp32, [9, 4, 2, 64]> cross_9 = mul(x = var_1086, y = var_1089)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [9, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1079)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [9, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1097, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1099_transpose_x_0 = const()[name = tensor<string, []>("op_1099_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1099_transpose_y_0 = const()[name = tensor<string, []>("op_1099_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 2, 64]> var_1099 = matmul(transpose_x = var_1099_transpose_x_0, transpose_y = var_1099_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1099")];
+            tensor<fp32, [2]> var_1100 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1100")];
+            tensor<int32, [4]> var_1101 = const()[name = tensor<string, []>("op_1101"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1102 = reshape(shape = var_1101, x = var_1100)[name = tensor<string, []>("op_1102")];
+            tensor<fp32, [9, 4, 2, 64]> cross_9 = mul(x = var_1099, y = var_1102)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [9, 4, 2, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1092 = const()[name = tensor<string, []>("op_1092"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1093 = reshape(shape = var_1092, x = valid_mask)[name = tensor<string, []>("op_1093")];
-            tensor<fp32, [9, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1093)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1095 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1095")];
-            tensor<bool, []> var_1097_transpose_x_1 = const()[name = tensor<string, []>("op_1097_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1097_transpose_y_1 = const()[name = tensor<string, []>("op_1097_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1097 = matmul(transpose_x = var_1097_transpose_x_1, transpose_y = var_1097_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1097")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1095, y = var_1097)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1099_keep_dims_0 = const()[name = tensor<string, []>("op_1099_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1099 = reduce_sum(keep_dims = var_1099_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1099")];
-            tensor<int32, [1]> var_1100 = const()[name = tensor<string, []>("op_1100"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1101 = reshape(shape = var_1100, x = var_1099)[name = tensor<string, []>("op_1101")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1101)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1105 = const()[name = tensor<string, []>("op_1105"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1106 = reshape(shape = var_1105, x = valid_mask)[name = tensor<string, []>("op_1106")];
+            tensor<fp32, [9, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1106)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [9, 4, 64, 64]> var_1108 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1108")];
+            tensor<bool, []> var_1110_transpose_x_1 = const()[name = tensor<string, []>("op_1110_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1110_transpose_y_1 = const()[name = tensor<string, []>("op_1110_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1110 = matmul(transpose_x = var_1110_transpose_x_1, transpose_y = var_1110_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1110")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1108, y = var_1110)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1112_keep_dims_0 = const()[name = tensor<string, []>("op_1112_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1112 = reduce_sum(keep_dims = var_1112_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1112")];
+            tensor<int32, [1]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1114 = reshape(shape = var_1113, x = var_1112)[name = tensor<string, []>("op_1114")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1114)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_958, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_39, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1105 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1105")];
-            tensor<int32, [4]> var_1106_perm_0 = const()[name = tensor<string, []>("op_1106_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [9, 4, 64, 64]> var_1118 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1118")];
+            tensor<int32, [4]> var_1119_perm_0 = const()[name = tensor<string, []>("op_1119_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 2, 4, 64]> var_1106 = transpose(perm = var_1106_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [9, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_955, x = var_1106)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [9, 2, 256]> out_29 = reshape(shape = var_1110, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [9, 2, 256]> var_1112 = silu(x = input_169)[name = tensor<string, []>("op_1112")];
-            tensor<fp32, [9, 2, 256]> input_171 = mul(x = var_1112, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [9, 2, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [9, 2, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 2, 4, 64]> var_1119 = transpose(perm = var_1119_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [9, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_54, x = var_1119)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [9, 2, 256]> out_29 = reshape(shape = var_1123, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [9, 2, 256]> var_1125 = silu(x = input_171)[name = tensor<string, []>("op_1125")];
+            tensor<fp32, [9, 2, 256]> input_173 = mul(x = var_1125, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 2, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [9, 2, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_953, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1122 = const()[name = tensor<string, []>("op_1122"), val = tensor<int32, [4]>([1, 9, 2, 256])];
-            tensor<fp32, [1, 9, 2, 256]> var_1123 = reshape(shape = var_1122, x = xt_1)[name = tensor<string, []>("op_1123")];
-            tensor<int32, [4]> var_1124_perm_0 = const()[name = tensor<string, []>("op_1124_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1127 = const()[name = tensor<string, []>("op_1127"), val = tensor<int32, [3]>([2, 9, 256])];
-            tensor<fp32, [1, 2, 9, 256]> var_1124 = transpose(perm = var_1124_perm_0, x = var_1123)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [2, 9, 256]> query_1 = reshape(shape = var_1127, x = var_1124)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [9, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_46, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1135 = const()[name = tensor<string, []>("op_1135"), val = tensor<int32, [4]>([1, 9, 2, 256])];
+            tensor<fp32, [1, 9, 2, 256]> var_1136 = reshape(shape = var_1135, x = xt_1)[name = tensor<string, []>("op_1136")];
+            tensor<int32, [4]> var_1137_perm_0 = const()[name = tensor<string, []>("op_1137_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1140 = const()[name = tensor<string, []>("op_1140"), val = tensor<int32, [3]>([2, 9, 256])];
+            tensor<fp32, [1, 2, 9, 256]> var_1137 = transpose(perm = var_1137_perm_0, x = var_1136)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [2, 9, 256]> query_1 = reshape(shape = var_1140, x = var_1137)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 2, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [9, 2, 768]> var_1150 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [9, 2, 768]> var_1163 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([9, 2, 3, 256])];
-            tensor<fp32, [9, 2, 3, 256]> var_1152 = reshape(shape = concat_1, x = var_1150)[name = tensor<string, []>("op_1152")];
-            tensor<int32, [1]> var_1153_axes_0 = const()[name = tensor<string, []>("op_1153_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 2, 3, 256]> var_1153 = expand_dims(axes = var_1153_axes_0, x = var_1152)[name = tensor<string, []>("op_1153")];
-            tensor<int32, [5]> var_1154_perm_0 = const()[name = tensor<string, []>("op_1154_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1155_axes_0 = const()[name = tensor<string, []>("op_1155_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 2, 1, 256]> var_1154 = transpose(perm = var_1154_perm_0, x = var_1153)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 9, 2, 256]> var_1155 = squeeze(axes = var_1155_axes_0, x = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<fp32, [9, 2, 3, 256]> var_1165 = reshape(shape = concat_1, x = var_1163)[name = tensor<string, []>("op_1165")];
+            tensor<int32, [1]> var_1166_axes_0 = const()[name = tensor<string, []>("op_1166_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 2, 3, 256]> var_1166 = expand_dims(axes = var_1166_axes_0, x = var_1165)[name = tensor<string, []>("op_1166")];
+            tensor<int32, [5]> var_1167_perm_0 = const()[name = tensor<string, []>("op_1167_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = tensor<string, []>("op_1168_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 2, 1, 256]> var_1167 = transpose(perm = var_1167_perm_0, x = var_1166)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 9, 2, 256]> var_1168 = squeeze(axes = var_1168_axes_0, x = var_1167)[name = tensor<string, []>("op_1168")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 9, 2, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [9, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 9, 2, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [9, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 9, 2, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1163 = const()[name = tensor<string, []>("op_1163"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1164 = reshape(shape = var_1163, x = q_11)[name = tensor<string, []>("op_1164")];
+            tensor<fp32, [9, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1176 = const()[name = tensor<string, []>("op_1176"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1177 = reshape(shape = var_1176, x = q_11)[name = tensor<string, []>("op_1177")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1170 = const()[name = tensor<string, []>("op_1170"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1171 = reshape(shape = var_1170, x = k_11)[name = tensor<string, []>("op_1171")];
+            tensor<int32, [3]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1184 = reshape(shape = var_1183, x = k_11)[name = tensor<string, []>("op_1184")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1178 = reshape(shape = var_1177, x = v_11)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [3]> var_1190 = const()[name = tensor<string, []>("op_1190"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1191 = reshape(shape = var_1190, x = v_11)[name = tensor<string, []>("op_1191")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1181 = const()[name = tensor<string, []>("op_1181"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1164)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [2, 4, 9, 64]> q_15 = reshape(shape = var_1181, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1171)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [2, 4, 9, 64]> k_15 = reshape(shape = var_1183, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1178)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [2, 4, 9, 64]> v_15 = reshape(shape = var_1185, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1177)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [2, 4, 9, 64]> q_15 = reshape(shape = var_1194, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1196 = const()[name = tensor<string, []>("op_1196"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1184)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [2, 4, 9, 64]> k_15 = reshape(shape = var_1196, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1198 = const()[name = tensor<string, []>("op_1198"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1191)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [2, 4, 9, 64]> v_15 = reshape(shape = var_1198, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 9, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1039,30 +1046,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 9, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1193 = const()[name = tensor<string, []>("op_1193"), val = tensor<int32, [2]>([18, 256])];
-            tensor<fp32, [9, 2, 4, 64]> var_1189 = transpose(perm = var_1188, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [18, 256]> attn_output_3 = reshape(shape = var_1193, x = var_1189)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [18, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [9, 2, 256]> attn_output_7 = reshape(shape = var_1197, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1201 = const()[name = tensor<string, []>("op_1201"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<int32, [2]>([18, 256])];
+            tensor<fp32, [9, 2, 4, 64]> var_1202 = transpose(perm = var_1201, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [18, 256]> attn_output_3 = reshape(shape = var_1206, x = var_1202)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [18, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1210 = const()[name = tensor<string, []>("op_1210"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [9, 2, 256]> attn_output_7 = reshape(shape = var_1210, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 9, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [2, 9, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 9, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_953, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [2, 9, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [2, 9, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [2, 9, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [2, 9, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 9, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 9, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_46, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [2, 9, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [2, 9, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 9, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [2, 9, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_953, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([1, 2, 9, 256])];
-            tensor<fp32, [1, 2, 9, 256]> x_31 = reshape(shape = var_1217, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1219_perm_0 = const()[name = tensor<string, []>("op_1219_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1223 = const()[name = tensor<string, []>("op_1223"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [1, 9, 2, 256]> var_1219 = transpose(perm = var_1219_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [9, 2, 256]> x = reshape(shape = var_1223, x = var_1219)[name = tensor<string, []>("x")];
+            tensor<fp32, [2, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_46, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [4]>([1, 2, 9, 256])];
+            tensor<fp32, [1, 2, 9, 256]> x_31 = reshape(shape = var_1230, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1232_perm_0 = const()[name = tensor<string, []>("op_1232_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [1, 9, 2, 256]> var_1232 = transpose(perm = var_1232_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [9, 2, 256]> x = reshape(shape = var_1236, x = var_1232)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1073,120 +1080,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [9, 2, 256]> var_1231 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1233 = reshape(shape = var_1232, x = var_1231)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [9, 2, 256]> var_1244 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1246 = reshape(shape = var_1245, x = var_1244)[name = tensor<string, []>("op_1246")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> var_1237 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 2, 256]> var_1239 = mul(x = var_1237, y = var_1238)[name = tensor<string, []>("op_1239")];
-            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [9, 2, 256]> var_1250 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1251 = const()[name = tensor<string, []>("op_1251"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 2, 256]> var_1252 = mul(x = var_1250, y = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<int32, [4]> var_1253 = const()[name = tensor<string, []>("op_1253"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1254 = reshape(shape = var_1253, x = var_1252)[name = tensor<string, []>("op_1254")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> var_1245 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1246 = const()[name = tensor<string, []>("op_1246"), val = tensor<int32, [4]>([9, 2, 4, 64])];
-            tensor<fp32, [9, 2, 4, 64]> var_1247 = reshape(shape = var_1246, x = var_1245)[name = tensor<string, []>("op_1247")];
+            tensor<fp32, [9, 2, 256]> var_1258 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [4]>([9, 2, 4, 64])];
+            tensor<fp32, [9, 2, 4, 64]> var_1260 = reshape(shape = var_1259, x = var_1258)[name = tensor<string, []>("op_1260")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 2, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [9, 2, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [2]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_3 = clip(alpha = var_958, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [2]> clip_3 = clip(alpha = var_39, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [2]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1241)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [9, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1233)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [9, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1254)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [9, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1246)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [9, 4, 2, 2]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1263 = reshape(shape = var_1262, x = sqrt_s_t)[name = tensor<string, []>("op_1263")];
-            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1263)[name = tensor<string, []>("M")];
-            tensor<fp32, [9, 4, 2, 2]> var_1265 = mul(x = qk, y = M)[name = tensor<string, []>("op_1265")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1247)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [9, 4, 2, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1265, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1267_transpose_x_0 = const()[name = tensor<string, []>("op_1267_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1267_transpose_y_0 = const()[name = tensor<string, []>("op_1267_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 2, 64]> var_1267 = matmul(transpose_x = var_1267_transpose_x_0, transpose_y = var_1267_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1267")];
-            tensor<fp32, [2]> var_1268 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1268")];
-            tensor<int32, [4]> var_1269 = const()[name = tensor<string, []>("op_1269"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1270 = reshape(shape = var_1269, x = var_1268)[name = tensor<string, []>("op_1270")];
-            tensor<fp32, [9, 4, 2, 64]> cross = mul(x = var_1267, y = var_1270)[name = tensor<string, []>("cross")];
-            tensor<fp32, [9, 4, 2, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [9, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1093)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [9, 4, 64, 64]> var_1276 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1276")];
-            tensor<bool, []> var_1278_transpose_x_1 = const()[name = tensor<string, []>("op_1278_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1278_transpose_y_1 = const()[name = tensor<string, []>("op_1278_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1278 = matmul(transpose_x = var_1278_transpose_x_1, transpose_y = var_1278_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1278")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1276, y = var_1278)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1101)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1275 = const()[name = tensor<string, []>("op_1275"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1276 = reshape(shape = var_1275, x = sqrt_s_t)[name = tensor<string, []>("op_1276")];
+            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1276)[name = tensor<string, []>("M")];
+            tensor<fp32, [9, 4, 2, 2]> var_1278 = mul(x = qk, y = M)[name = tensor<string, []>("op_1278")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1260)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [9, 4, 2, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1278, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1280_transpose_x_0 = const()[name = tensor<string, []>("op_1280_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1280_transpose_y_0 = const()[name = tensor<string, []>("op_1280_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 2, 64]> var_1280 = matmul(transpose_x = var_1280_transpose_x_0, transpose_y = var_1280_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1280")];
+            tensor<fp32, [2]> var_1281 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1281")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1283 = reshape(shape = var_1282, x = var_1281)[name = tensor<string, []>("op_1283")];
+            tensor<fp32, [9, 4, 2, 64]> cross = mul(x = var_1280, y = var_1283)[name = tensor<string, []>("cross")];
+            tensor<fp32, [9, 4, 2, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [9, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1106)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [9, 4, 64, 64]> var_1289 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1289")];
+            tensor<bool, []> var_1291_transpose_x_1 = const()[name = tensor<string, []>("op_1291_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1291_transpose_y_1 = const()[name = tensor<string, []>("op_1291_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1291 = matmul(transpose_x = var_1291_transpose_x_1, transpose_y = var_1291_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1291")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1289, y = var_1291)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1114)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_958, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_39, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [9, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1287_perm_0 = const()[name = tensor<string, []>("op_1287_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1300_perm_0 = const()[name = tensor<string, []>("op_1300_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 2, 4, 64]> var_1287 = transpose(perm = var_1287_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [9, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_955, x = var_1287)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1291 = const()[name = tensor<string, []>("op_1291"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [9, 2, 256]> out = reshape(shape = var_1291, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [9, 2, 256]> var_1293 = silu(x = input_187)[name = tensor<string, []>("op_1293")];
-            tensor<fp32, [9, 2, 256]> input_189 = mul(x = var_1293, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [9, 2, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [9, 2, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 2, 4, 64]> var_1300 = transpose(perm = var_1300_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [9, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_54, x = var_1300)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [9, 2, 256]> out = reshape(shape = var_1304, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [9, 2, 256]> var_1306 = silu(x = input_189)[name = tensor<string, []>("op_1306")];
+            tensor<fp32, [9, 2, 256]> input_191 = mul(x = var_1306, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 2, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [9, 2, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_953, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [4]>([1, 9, 2, 256])];
-            tensor<fp32, [1, 9, 2, 256]> var_1304 = reshape(shape = var_1303, x = xt_5)[name = tensor<string, []>("op_1304")];
-            tensor<int32, [4]> var_1305_perm_0 = const()[name = tensor<string, []>("op_1305_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1308 = const()[name = tensor<string, []>("op_1308"), val = tensor<int32, [3]>([2, 9, 256])];
-            tensor<fp32, [1, 2, 9, 256]> var_1305 = transpose(perm = var_1305_perm_0, x = var_1304)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [2, 9, 256]> query_5 = reshape(shape = var_1308, x = var_1305)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [9, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_46, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1316 = const()[name = tensor<string, []>("op_1316"), val = tensor<int32, [4]>([1, 9, 2, 256])];
+            tensor<fp32, [1, 9, 2, 256]> var_1317 = reshape(shape = var_1316, x = xt_5)[name = tensor<string, []>("op_1317")];
+            tensor<int32, [4]> var_1318_perm_0 = const()[name = tensor<string, []>("op_1318_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [3]>([2, 9, 256])];
+            tensor<fp32, [1, 2, 9, 256]> var_1318 = transpose(perm = var_1318_perm_0, x = var_1317)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [2, 9, 256]> query_5 = reshape(shape = var_1321, x = var_1318)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 2, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [9, 2, 768]> var_1331 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [9, 2, 768]> var_1344 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([9, 2, 3, 256])];
-            tensor<fp32, [9, 2, 3, 256]> var_1333 = reshape(shape = concat_2, x = var_1331)[name = tensor<string, []>("op_1333")];
-            tensor<int32, [1]> var_1334_axes_0 = const()[name = tensor<string, []>("op_1334_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 2, 3, 256]> var_1334 = expand_dims(axes = var_1334_axes_0, x = var_1333)[name = tensor<string, []>("op_1334")];
-            tensor<int32, [5]> var_1335_perm_0 = const()[name = tensor<string, []>("op_1335_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1336_axes_0 = const()[name = tensor<string, []>("op_1336_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 2, 1, 256]> var_1335 = transpose(perm = var_1335_perm_0, x = var_1334)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 9, 2, 256]> var_1336 = squeeze(axes = var_1336_axes_0, x = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<fp32, [9, 2, 3, 256]> var_1346 = reshape(shape = concat_2, x = var_1344)[name = tensor<string, []>("op_1346")];
+            tensor<int32, [1]> var_1347_axes_0 = const()[name = tensor<string, []>("op_1347_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 2, 3, 256]> var_1347 = expand_dims(axes = var_1347_axes_0, x = var_1346)[name = tensor<string, []>("op_1347")];
+            tensor<int32, [5]> var_1348_perm_0 = const()[name = tensor<string, []>("op_1348_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1349_axes_0 = const()[name = tensor<string, []>("op_1349_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 2, 1, 256]> var_1348 = transpose(perm = var_1348_perm_0, x = var_1347)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 9, 2, 256]> var_1349 = squeeze(axes = var_1349_axes_0, x = var_1348)[name = tensor<string, []>("op_1349")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 9, 2, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [9, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 9, 2, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [9, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 9, 2, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1344 = const()[name = tensor<string, []>("op_1344"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1345 = reshape(shape = var_1344, x = q_19)[name = tensor<string, []>("op_1345")];
+            tensor<fp32, [9, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1358 = reshape(shape = var_1357, x = q_19)[name = tensor<string, []>("op_1358")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1351 = const()[name = tensor<string, []>("op_1351"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1352 = reshape(shape = var_1351, x = k_19)[name = tensor<string, []>("op_1352")];
+            tensor<int32, [3]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1365 = reshape(shape = var_1364, x = k_19)[name = tensor<string, []>("op_1365")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [3]>([9, 8, 64])];
-            tensor<fp32, [9, 8, 64]> var_1359 = reshape(shape = var_1358, x = v_19)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [3]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [3]>([9, 8, 64])];
+            tensor<fp32, [9, 8, 64]> var_1372 = reshape(shape = var_1371, x = v_19)[name = tensor<string, []>("op_1372")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1362 = const()[name = tensor<string, []>("op_1362"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1345)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [2, 4, 9, 64]> q = reshape(shape = var_1362, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1352)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [2, 4, 9, 64]> k = reshape(shape = var_1364, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([2, 4, 9, 64])];
-            tensor<fp32, [8, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1359)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [2, 4, 9, 64]> v = reshape(shape = var_1366, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1375 = const()[name = tensor<string, []>("op_1375"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1358)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [2, 4, 9, 64]> q = reshape(shape = var_1375, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1377 = const()[name = tensor<string, []>("op_1377"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1365)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [2, 4, 9, 64]> k = reshape(shape = var_1377, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1379 = const()[name = tensor<string, []>("op_1379"), val = tensor<int32, [4]>([2, 4, 9, 64])];
+            tensor<fp32, [8, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1372)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [2, 4, 9, 64]> v = reshape(shape = var_1379, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 9, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1197,36 +1204,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 9, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1374 = const()[name = tensor<string, []>("op_1374"), val = tensor<int32, [2]>([18, 256])];
-            tensor<fp32, [9, 2, 4, 64]> var_1370 = transpose(perm = var_1369, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [18, 256]> attn_output_11 = reshape(shape = var_1374, x = var_1370)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [18, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<int32, [3]>([9, 2, 256])];
-            tensor<fp32, [9, 2, 256]> attn_output = reshape(shape = var_1378, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1382 = const()[name = tensor<string, []>("op_1382"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1387 = const()[name = tensor<string, []>("op_1387"), val = tensor<int32, [2]>([18, 256])];
+            tensor<fp32, [9, 2, 4, 64]> var_1383 = transpose(perm = var_1382, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [18, 256]> attn_output_11 = reshape(shape = var_1387, x = var_1383)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [18, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1391 = const()[name = tensor<string, []>("op_1391"), val = tensor<int32, [3]>([9, 2, 256])];
+            tensor<fp32, [9, 2, 256]> attn_output = reshape(shape = var_1391, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 9, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [2, 9, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 9, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_953, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [2, 9, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [2, 9, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [2, 9, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [2, 9, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 9, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 9, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_46, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [2, 9, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [2, 9, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 9, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [2, 9, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_953, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([1, 2, 9, 256])];
-            tensor<fp32, [1, 2, 9, 256]> input = reshape(shape = var_1398, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 9, 1]> var_1401 = reduce_l2_norm(axes = var_1400, keep_dims = var_956, x = input)[name = tensor<string, []>("op_1401")];
+            tensor<fp32, [2, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_46, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1411 = const()[name = tensor<string, []>("op_1411"), val = tensor<int32, [4]>([1, 2, 9, 256])];
+            tensor<fp32, [1, 2, 9, 256]> input = reshape(shape = var_1411, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 9, 1]> var_1414 = reduce_l2_norm(axes = var_1413, keep_dims = var_45, x = input)[name = tensor<string, []>("op_1414")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 9, 1]> clip_5 = clip(alpha = var_948, beta = const_42, x = var_1401)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 2, 9, 256]> var_1403 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [1, 2, 9, 1]> clip_5 = clip(alpha = var_59, beta = const_42, x = var_1414)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 2, 9, 256]> var_1416 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1416")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([2, 1, 256])];
             tensor<fp32, [2, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([2, 256, 9])];
-            tensor<fp32, [1, 2, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1403)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 2, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1416)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [2, 256, 9]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1237,10 +1244,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 2, 8])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 2, 7]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 2, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1407")];
-            tensor<int32, []> var_1409_axis_0 = const()[name = tensor<string, []>("op_1409_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1409_axis_0, values = (var_1105, nkv))[name = tensor<string, []>("op_1409")];
-            tensor<int32, []> var_1411_axis_0 = const()[name = tensor<string, []>("op_1411_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1411_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1411")];
+            tensor<fp32, [1, 2, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1420")];
+            tensor<int32, []> var_1422_axis_0 = const()[name = tensor<string, []>("op_1422_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1422_axis_0, values = (var_1118, nkv))[name = tensor<string, []>("op_1422")];
+            tensor<int32, []> var_1424_axis_0 = const()[name = tensor<string, []>("op_1424_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1424_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1424")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index bee7293d1930f8df17e82f398224b087ead0eed6..b0b764951a916e183e0396624d5f52625082c014 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f15d275fba79646bf8e6b1acd94f18b344b4ba451e802249a4814d7601f426f
-size 179867
+oid sha256:c2fdc8691b90d30c33d9e6eeba0997e644c82adc5af2f87dd493ba202c0ffc11
+size 184847
diff --git a/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Manifest.json b/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Manifest.json
index cd59efc886fa4d1cfef4032c5411dbb7569655b1..d53e7f18f986fec2a3e83fc62b0c87de4207fbbd 100644
--- a/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Manifest.json
+++ b/optimized/ch/200ms/ls_eend_ch_200ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "5C0B804E-E764-41B0-9747-F2E8A83C47B7": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "9610BCAB-14A1-4087-9959-ED1CB79286B1": {
+        "857B2C68-68CE-48B9-8135-33407386BD83": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "D82F93A9-BACD-4FC2-824C-63C27C744141": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "5C0B804E-E764-41B0-9747-F2E8A83C47B7"
+    "rootModelIdentifier": "D82F93A9-BACD-4FC2-824C-63C27C744141"
 }
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/analytics/coremldata.bin b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/analytics/coremldata.bin
index d8f91c11eb8b5980108c3d2fdd2be965c635d965..2af7b7c943db8138fbdd9decacfd9fbee182def8 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ceb496d3598e7070ae0538a32010b7e9d3ffdd68c9c569f14b6987731dc9f1ad
+oid sha256:266d247cb95834ecbe57555a6d80a22fe1fd9858d3ac21bbadbd681e462cf2e1
 size 243
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/coremldata.bin b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/coremldata.bin
index 58a2886f988ba53cdc741a3c5780fe599407acd9..c772e73a83c2eb579b7a2e6be9a1f20af803b388 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/coremldata.bin
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:595472cafa7ff618c858be86076cfe0e894ffc64361b5318c631e631bd0a0669
-size 1301
+oid sha256:667d6e71963d032f6ece74f64e2e61c6391d8b221cffef41062b74c3268b58c9
+size 1404
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/metadata.json b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/metadata.json
index 637d583fb946317262e183e23421f1a720e2a1b1..90bda323158607a50918acb5f9fa5f44cfb1d2a9 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/metadata.json
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=3, max_speakers=7)",
+    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=3, max_speakers=7, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 56,
+      "Ios17.sliceByIndex" : 59,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 18,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 3 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 35 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 3, 345]",
+        "shape" : "[1, 35, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 35}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/model.mil b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/model.mil
index 5eef33ebf1d0746b1ddb3ff22edac4fc03ac7b54..bef23272c0c03dca7b3c2e0e89406d225ef31cf4 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/model.mil
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlmodelc/model.mil
@@ -1,234 +1,252 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 3, 345]> features, tensor<fp32, [3]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [3, 3]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [3]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [3, 3]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 3, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 35, 23]> features, tensor<fp32, [3]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [3, 3]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [3]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [3, 3]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<int32, [3]>([1, 3, 345])];
+            tensor<fp32, [1, 3, 345]> input_1 = reshape(shape = var_46, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_61 = const()[name = tensor<string, []>("op_61"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_64 = const()[name = tensor<string, []>("op_64"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 3, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 3, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 3, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_56, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 3, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 3, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_193 = const()[name = tensor<string, []>("op_193"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_194 = mul(x = input_13, y = var_193)[name = tensor<string, []>("op_194")];
+            tensor<fp32, [1, 3, 256]> input_15 = add(x = var_194, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,163 +257,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 3, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 3, 256]> var_208 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_209 = const()[name = tensor<string, []>("op_209"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_210 = reshape(shape = var_209, x = var_208)[name = tensor<string, []>("op_210")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 3, 256]> var_214 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_215 = const()[name = tensor<string, []>("op_215"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_216 = mul(x = var_214, y = var_215)[name = tensor<string, []>("op_216")];
+            tensor<int32, [4]> var_217 = const()[name = tensor<string, []>("op_217"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_218 = reshape(shape = var_217, x = var_216)[name = tensor<string, []>("op_218")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 3, 256]> var_222 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_223 = const()[name = tensor<string, []>("op_223"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_224 = reshape(shape = var_223, x = var_222)[name = tensor<string, []>("op_224")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 3, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [3]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_218)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_210)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 3, 3]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [3, 3]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 3, 3]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_234 = const()[name = tensor<string, []>("op_234"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_235 = reshape(shape = var_234, x = sqrt_s_t_1)[name = tensor<string, []>("op_235")];
+            tensor<fp32, [3, 3]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_235)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 3, 3]> var_237 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_237")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [3]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_224)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_237, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_239_transpose_x_0 = const()[name = tensor<string, []>("op_239_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_239_transpose_y_0 = const()[name = tensor<string, []>("op_239_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_239 = matmul(transpose_x = var_239_transpose_x_0, transpose_y = var_239_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [3]> var_240 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_240")];
+            tensor<int32, [4]> var_241 = const()[name = tensor<string, []>("op_241"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_242 = reshape(shape = var_241, x = var_240)[name = tensor<string, []>("op_242")];
+            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_239, y = var_242)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 3, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_245 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_245")];
+            tensor<bool, []> var_247_transpose_x_1 = const()[name = tensor<string, []>("op_247_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_247_transpose_y_1 = const()[name = tensor<string, []>("op_247_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_247 = matmul(transpose_x = var_247_transpose_x_1, transpose_y = var_247_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_247")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_245, y = var_247)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_249 = const()[name = tensor<string, []>("op_249"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_249)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_251 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_251")];
+            tensor<fp32, [1, 4, 64, 64]> var_252 = real_div(x = new_kv_unnorm_1, y = var_251)[name = tensor<string, []>("op_252")];
+            tensor<int32, [4]> var_253_perm_0 = const()[name = tensor<string, []>("op_253_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 3, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 3, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 3, 4, 64]> var_253 = transpose(perm = var_253_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_64, x = var_253)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_257 = const()[name = tensor<string, []>("op_257"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_257, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 3, 256]> var_259 = silu(x = input_19)[name = tensor<string, []>("op_259")];
+            tensor<fp32, [1, 3, 256]> input_21 = mul(x = var_259, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_267 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = x_3)[name = tensor<string, []>("op_267")];
+            tensor<int32, [3]> var_270_begin_0 = const()[name = tensor<string, []>("op_270_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_270_end_0 = const()[name = tensor<string, []>("op_270_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_270_end_mask_0 = const()[name = tensor<string, []>("op_270_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_270 = slice_by_index(begin = var_270_begin_0, end = var_270_end_0, end_mask = var_270_end_mask_0, x = window_1)[name = tensor<string, []>("op_270")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_72, interleave = window_3_interleave_0, values = (var_270, var_267))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_275 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = x_3)[name = tensor<string, []>("op_275")];
+            tensor<int32, [3]> var_278_begin_0 = const()[name = tensor<string, []>("op_278_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_278_end_0 = const()[name = tensor<string, []>("op_278_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_278_end_mask_0 = const()[name = tensor<string, []>("op_278_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_278 = slice_by_index(begin = var_278_begin_0, end = var_278_end_0, end_mask = var_278_end_mask_0, x = window_3)[name = tensor<string, []>("op_278")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_72, interleave = window_5_interleave_0, values = (var_278, var_275))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_283 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = x_3)[name = tensor<string, []>("op_283")];
+            tensor<int32, [3]> var_286_begin_0 = const()[name = tensor<string, []>("op_286_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_286_end_0 = const()[name = tensor<string, []>("op_286_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_286_end_mask_0 = const()[name = tensor<string, []>("op_286_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_286 = slice_by_index(begin = var_286_begin_0, end = var_286_end_0, end_mask = var_286_end_mask_0, x = window_5)[name = tensor<string, []>("op_286")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_72, interleave = window_7_interleave_0, values = (var_286, var_283))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_23 = concat(axis = var_59, interleave = input_23_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_265_split_sizes_0 = const()[name = tensor<string, []>("op_265_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_265_axis_0 = const()[name = tensor<string, []>("op_265_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_265_0, tensor<fp32, [3, 256, 16]> var_265_1 = split(axis = var_265_axis_0, split_sizes = var_265_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_265")];
-            tensor<fp32, [3, 256, 16]> var_267 = sigmoid(x = var_265_1)[name = tensor<string, []>("op_267")];
-            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_265_0, y = var_267)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [3, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_311_split_sizes_0 = const()[name = tensor<string, []>("op_311_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_311_axis_0 = const()[name = tensor<string, []>("op_311_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_311_0, tensor<fp32, [3, 256, 16]> var_311_1 = split(axis = var_311_axis_0, split_sizes = var_311_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_311")];
+            tensor<fp32, [3, 256, 16]> var_313 = sigmoid(x = var_311_1)[name = tensor<string, []>("op_313")];
+            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_311_0, y = var_313)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [3, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [3, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [3, 1, 256]> var_298 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_298")];
-            tensor<int32, [3]> var_300_perm_0 = const()[name = tensor<string, []>("op_300_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_300 = transpose(perm = var_300_perm_0, x = var_298)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 3, 256]> input_31 = add(x = x_3, y = var_300)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 3, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 3, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_323 = const()[name = tensor<string, []>("op_323"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_324 = mul(x = input_39, y = var_323)[name = tensor<string, []>("op_324")];
-            tensor<fp32, [1, 3, 256]> input_41 = add(x = var_324, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_344_begin_0 = const()[name = tensor<string, []>("op_344_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_344_end_0 = const()[name = tensor<string, []>("op_344_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_344_end_mask_0 = const()[name = tensor<string, []>("op_344_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [3, 1, 256]> var_344 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_344")];
+            tensor<int32, [3]> var_346_perm_0 = const()[name = tensor<string, []>("op_346_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_346 = transpose(perm = var_346_perm_0, x = var_344)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 3, 256]> input_33 = add(x = x_3, y = var_346)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 3, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 3, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_370 = mul(x = input_41, y = var_369)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> input_43 = add(x = var_370, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 3, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 3, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_354 = mul(x = input_51, y = var_353)[name = tensor<string, []>("op_354")];
-            tensor<fp32, [1, 3, 256]> input_53 = add(x = var_354, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 3, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 3, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_399 = const()[name = tensor<string, []>("op_399"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_400 = mul(x = input_53, y = var_399)[name = tensor<string, []>("op_400")];
+            tensor<fp32, [1, 3, 256]> input_55 = add(x = var_400, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -406,163 +424,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 3, 256]> var_368 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> var_414 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_415 = const()[name = tensor<string, []>("op_415"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_416 = reshape(shape = var_415, x = var_414)[name = tensor<string, []>("op_416")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_374 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_376 = mul(x = var_374, y = var_375)[name = tensor<string, []>("op_376")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 3, 256]> var_420 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_421 = const()[name = tensor<string, []>("op_421"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_422 = mul(x = var_420, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423 = const()[name = tensor<string, []>("op_423"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_424 = reshape(shape = var_423, x = var_422)[name = tensor<string, []>("op_424")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_382 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_384 = reshape(shape = var_383, x = var_382)[name = tensor<string, []>("op_384")];
+            tensor<fp32, [1, 3, 256]> var_428 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 3, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [3]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_424)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_416)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 3, 3]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_395 = reshape(shape = var_394, x = sqrt_s_t_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [3, 3]> M_3 = real_div(x = encoder__causal_mask, y = var_395)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 3, 3]> var_397 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_397")];
+            tensor<int32, [2]> var_440 = const()[name = tensor<string, []>("op_440"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_441 = reshape(shape = var_440, x = sqrt_s_t_3)[name = tensor<string, []>("op_441")];
+            tensor<fp32, [3, 3]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_441)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 3, 3]> var_443 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_443")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_384)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_397, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_399_transpose_x_0 = const()[name = tensor<string, []>("op_399_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_399_transpose_y_0 = const()[name = tensor<string, []>("op_399_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_0, transpose_y = var_399_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [3]> var_400 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_400")];
-            tensor<int32, [4]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_402 = reshape(shape = var_401, x = var_400)[name = tensor<string, []>("op_402")];
-            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_399, y = var_402)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_430)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_443, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_445_transpose_x_0 = const()[name = tensor<string, []>("op_445_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_445_transpose_y_0 = const()[name = tensor<string, []>("op_445_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_445 = matmul(transpose_x = var_445_transpose_x_0, transpose_y = var_445_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_445")];
+            tensor<fp32, [3]> var_446 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_446")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
+            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_445, y = var_448)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 3, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_405 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_405")];
-            tensor<bool, []> var_407_transpose_x_1 = const()[name = tensor<string, []>("op_407_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_407_transpose_y_1 = const()[name = tensor<string, []>("op_407_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_1, transpose_y = var_407_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_405, y = var_407)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_409)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_411 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 4, 64, 64]> var_412 = real_div(x = new_kv_unnorm_3, y = var_411)[name = tensor<string, []>("op_412")];
-            tensor<int32, [4]> var_413_perm_0 = const()[name = tensor<string, []>("op_413_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_451 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_451")];
+            tensor<bool, []> var_453_transpose_x_1 = const()[name = tensor<string, []>("op_453_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_453_transpose_y_1 = const()[name = tensor<string, []>("op_453_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_453 = matmul(transpose_x = var_453_transpose_x_1, transpose_y = var_453_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_453")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_451, y = var_453)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_455 = const()[name = tensor<string, []>("op_455"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_455)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_457 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_457")];
+            tensor<fp32, [1, 4, 64, 64]> var_458 = real_div(x = new_kv_unnorm_3, y = var_457)[name = tensor<string, []>("op_458")];
+            tensor<int32, [4]> var_459_perm_0 = const()[name = tensor<string, []>("op_459_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_413 = transpose(perm = var_413_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_413)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_417, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 3, 256]> var_419 = silu(x = input_57)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 3, 256]> input_59 = mul(x = var_419, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 3, 4, 64]> var_459 = transpose(perm = var_459_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_64, x = var_459)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_463 = const()[name = tensor<string, []>("op_463"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_463, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 3, 256]> var_465 = silu(x = input_59)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 3, 256]> input_61 = mul(x = var_465, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<int32, [3]> var_473_begin_0 = const()[name = tensor<string, []>("op_473_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_473_end_0 = const()[name = tensor<string, []>("op_473_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_473_end_mask_0 = const()[name = tensor<string, []>("op_473_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_473 = slice_by_index(begin = var_473_begin_0, end = var_473_end_0, end_mask = var_473_end_mask_0, x = x_9)[name = tensor<string, []>("op_473")];
+            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = window_9)[name = tensor<string, []>("op_476")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_72, interleave = window_11_interleave_0, values = (var_476, var_473))[name = tensor<string, []>("window_11")];
+            tensor<int32, [3]> var_481_begin_0 = const()[name = tensor<string, []>("op_481_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_481_end_0 = const()[name = tensor<string, []>("op_481_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_481_end_mask_0 = const()[name = tensor<string, []>("op_481_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_481 = slice_by_index(begin = var_481_begin_0, end = var_481_end_0, end_mask = var_481_end_mask_0, x = x_9)[name = tensor<string, []>("op_481")];
+            tensor<int32, [3]> var_484_begin_0 = const()[name = tensor<string, []>("op_484_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_484_end_0 = const()[name = tensor<string, []>("op_484_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_484_end_mask_0 = const()[name = tensor<string, []>("op_484_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_484 = slice_by_index(begin = var_484_begin_0, end = var_484_end_0, end_mask = var_484_end_mask_0, x = window_11)[name = tensor<string, []>("op_484")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_72, interleave = window_13_interleave_0, values = (var_484, var_481))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_489_begin_0 = const()[name = tensor<string, []>("op_489_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_489_end_0 = const()[name = tensor<string, []>("op_489_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_489_end_mask_0 = const()[name = tensor<string, []>("op_489_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_489 = slice_by_index(begin = var_489_begin_0, end = var_489_end_0, end_mask = var_489_end_mask_0, x = x_9)[name = tensor<string, []>("op_489")];
+            tensor<int32, [3]> var_492_begin_0 = const()[name = tensor<string, []>("op_492_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_492_end_0 = const()[name = tensor<string, []>("op_492_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_492_end_mask_0 = const()[name = tensor<string, []>("op_492_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_492 = slice_by_index(begin = var_492_begin_0, end = var_492_end_0, end_mask = var_492_end_mask_0, x = window_13)[name = tensor<string, []>("op_492")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_72, interleave = window_15_interleave_0, values = (var_492, var_489))[name = tensor<string, []>("window_15")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_63 = concat(axis = var_59, interleave = input_63_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_471_split_sizes_0 = const()[name = tensor<string, []>("op_471_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_471_axis_0 = const()[name = tensor<string, []>("op_471_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_471_0, tensor<fp32, [3, 256, 16]> var_471_1 = split(axis = var_471_axis_0, split_sizes = var_471_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_471")];
-            tensor<fp32, [3, 256, 16]> var_473 = sigmoid(x = var_471_1)[name = tensor<string, []>("op_473")];
-            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_471_0, y = var_473)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [3, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_517_split_sizes_0 = const()[name = tensor<string, []>("op_517_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_517_axis_0 = const()[name = tensor<string, []>("op_517_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_517_0, tensor<fp32, [3, 256, 16]> var_517_1 = split(axis = var_517_axis_0, split_sizes = var_517_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_517")];
+            tensor<fp32, [3, 256, 16]> var_519 = sigmoid(x = var_517_1)[name = tensor<string, []>("op_519")];
+            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_517_0, y = var_519)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [3, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [3, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_504_begin_0 = const()[name = tensor<string, []>("op_504_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_504_end_0 = const()[name = tensor<string, []>("op_504_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_504_end_mask_0 = const()[name = tensor<string, []>("op_504_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [3, 1, 256]> var_504 = slice_by_index(begin = var_504_begin_0, end = var_504_end_0, end_mask = var_504_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_504")];
-            tensor<int32, [3]> var_506_perm_0 = const()[name = tensor<string, []>("op_506_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_506 = transpose(perm = var_506_perm_0, x = var_504)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 3, 256]> input_71 = add(x = x_9, y = var_506)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 3, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 3, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_529 = const()[name = tensor<string, []>("op_529"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_530 = mul(x = input_79, y = var_529)[name = tensor<string, []>("op_530")];
-            tensor<fp32, [1, 3, 256]> input_81 = add(x = var_530, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_550_begin_0 = const()[name = tensor<string, []>("op_550_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_550_end_0 = const()[name = tensor<string, []>("op_550_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_550_end_mask_0 = const()[name = tensor<string, []>("op_550_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [3, 1, 256]> var_550 = slice_by_index(begin = var_550_begin_0, end = var_550_end_0, end_mask = var_550_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_550")];
+            tensor<int32, [3]> var_552_perm_0 = const()[name = tensor<string, []>("op_552_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_552 = transpose(perm = var_552_perm_0, x = var_550)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 3, 256]> input_73 = add(x = x_9, y = var_552)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 3, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 3, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_576 = mul(x = input_81, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> input_83 = add(x = var_576, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 3, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 3, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_560 = mul(x = input_91, y = var_559)[name = tensor<string, []>("op_560")];
-            tensor<fp32, [1, 3, 256]> input_93 = add(x = var_560, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 3, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 3, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_606 = mul(x = input_93, y = var_605)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 3, 256]> input_95 = add(x = var_606, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -573,163 +591,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 3, 256]> var_574 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> var_620 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_621 = const()[name = tensor<string, []>("op_621"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_622 = reshape(shape = var_621, x = var_620)[name = tensor<string, []>("op_622")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_580 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_581 = const()[name = tensor<string, []>("op_581"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_582 = mul(x = var_580, y = var_581)[name = tensor<string, []>("op_582")];
-            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
+            tensor<fp32, [1, 3, 256]> var_626 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_628 = mul(x = var_626, y = var_627)[name = tensor<string, []>("op_628")];
+            tensor<int32, [4]> var_629 = const()[name = tensor<string, []>("op_629"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_630 = reshape(shape = var_629, x = var_628)[name = tensor<string, []>("op_630")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_588 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_589 = const()[name = tensor<string, []>("op_589"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_590 = reshape(shape = var_589, x = var_588)[name = tensor<string, []>("op_590")];
+            tensor<fp32, [1, 3, 256]> var_634 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_636 = reshape(shape = var_635, x = var_634)[name = tensor<string, []>("op_636")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 3, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [3]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_576)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_630)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_622)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 3, 3]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_600 = const()[name = tensor<string, []>("op_600"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_601 = reshape(shape = var_600, x = sqrt_s_t_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [3, 3]> M_5 = real_div(x = encoder__causal_mask, y = var_601)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 3, 3]> var_603 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_603")];
+            tensor<int32, [2]> var_646 = const()[name = tensor<string, []>("op_646"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_647 = reshape(shape = var_646, x = sqrt_s_t_5)[name = tensor<string, []>("op_647")];
+            tensor<fp32, [3, 3]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_647)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 3, 3]> var_649 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_649")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_590)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_603, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_605_transpose_x_0 = const()[name = tensor<string, []>("op_605_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_605_transpose_y_0 = const()[name = tensor<string, []>("op_605_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_605 = matmul(transpose_x = var_605_transpose_x_0, transpose_y = var_605_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_605")];
-            tensor<fp32, [3]> var_606 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_606")];
-            tensor<int32, [4]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_608 = reshape(shape = var_607, x = var_606)[name = tensor<string, []>("op_608")];
-            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_605, y = var_608)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_636)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_649, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_651_transpose_x_0 = const()[name = tensor<string, []>("op_651_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_651_transpose_y_0 = const()[name = tensor<string, []>("op_651_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_651 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_651")];
+            tensor<fp32, [3]> var_652 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_652")];
+            tensor<int32, [4]> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_654 = reshape(shape = var_653, x = var_652)[name = tensor<string, []>("op_654")];
+            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_651, y = var_654)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 3, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_611 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_611")];
-            tensor<bool, []> var_613_transpose_x_1 = const()[name = tensor<string, []>("op_613_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_613_transpose_y_1 = const()[name = tensor<string, []>("op_613_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_613 = matmul(transpose_x = var_613_transpose_x_1, transpose_y = var_613_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_613")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_611, y = var_613)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_615 = const()[name = tensor<string, []>("op_615"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_615)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_617 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [1, 4, 64, 64]> var_618 = real_div(x = new_kv_unnorm_5, y = var_617)[name = tensor<string, []>("op_618")];
-            tensor<int32, [4]> var_619_perm_0 = const()[name = tensor<string, []>("op_619_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_657 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_657")];
+            tensor<bool, []> var_659_transpose_x_1 = const()[name = tensor<string, []>("op_659_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_659_transpose_y_1 = const()[name = tensor<string, []>("op_659_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_659 = matmul(transpose_x = var_659_transpose_x_1, transpose_y = var_659_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_657, y = var_659)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_661)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_663 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_663")];
+            tensor<fp32, [1, 4, 64, 64]> var_664 = real_div(x = new_kv_unnorm_5, y = var_663)[name = tensor<string, []>("op_664")];
+            tensor<int32, [4]> var_665_perm_0 = const()[name = tensor<string, []>("op_665_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_619 = transpose(perm = var_619_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_619)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_623, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 3, 256]> var_625 = silu(x = input_97)[name = tensor<string, []>("op_625")];
-            tensor<fp32, [1, 3, 256]> input_99 = mul(x = var_625, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 3, 4, 64]> var_665 = transpose(perm = var_665_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_64, x = var_665)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_669 = const()[name = tensor<string, []>("op_669"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_669, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 3, 256]> var_671 = silu(x = input_99)[name = tensor<string, []>("op_671")];
+            tensor<fp32, [1, 3, 256]> input_101 = mul(x = var_671, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_17_begin_0 = const()[name = tensor<string, []>("window_17_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_17_end_0 = const()[name = tensor<string, []>("window_17_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_17_end_mask_0 = const()[name = tensor<string, []>("window_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_17_squeeze_mask_0 = const()[name = tensor<string, []>("window_17_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_17 = slice_by_index(begin = window_17_begin_0, end = window_17_end_0, end_mask = window_17_end_mask_0, squeeze_mask = window_17_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_633_begin_0 = const()[name = tensor<string, []>("op_633_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_633_end_0 = const()[name = tensor<string, []>("op_633_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_633_end_mask_0 = const()[name = tensor<string, []>("op_633_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_633 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = x_15)[name = tensor<string, []>("op_633")];
-            tensor<int32, [3]> var_636_begin_0 = const()[name = tensor<string, []>("op_636_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_636_end_0 = const()[name = tensor<string, []>("op_636_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_636_end_mask_0 = const()[name = tensor<string, []>("op_636_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_636 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = window_17)[name = tensor<string, []>("op_636")];
+            tensor<int32, [3]> var_679_begin_0 = const()[name = tensor<string, []>("op_679_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_679_end_0 = const()[name = tensor<string, []>("op_679_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_679_end_mask_0 = const()[name = tensor<string, []>("op_679_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_679 = slice_by_index(begin = var_679_begin_0, end = var_679_end_0, end_mask = var_679_end_mask_0, x = x_15)[name = tensor<string, []>("op_679")];
+            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = window_17)[name = tensor<string, []>("op_682")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_636, var_633))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_641_begin_0 = const()[name = tensor<string, []>("op_641_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_641_end_0 = const()[name = tensor<string, []>("op_641_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_641_end_mask_0 = const()[name = tensor<string, []>("op_641_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_641 = slice_by_index(begin = var_641_begin_0, end = var_641_end_0, end_mask = var_641_end_mask_0, x = x_15)[name = tensor<string, []>("op_641")];
-            tensor<int32, [3]> var_644_begin_0 = const()[name = tensor<string, []>("op_644_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_644_end_0 = const()[name = tensor<string, []>("op_644_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_644_end_mask_0 = const()[name = tensor<string, []>("op_644_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_644 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = window_19)[name = tensor<string, []>("op_644")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_72, interleave = window_19_interleave_0, values = (var_682, var_679))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_687_begin_0 = const()[name = tensor<string, []>("op_687_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_687_end_0 = const()[name = tensor<string, []>("op_687_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_687_end_mask_0 = const()[name = tensor<string, []>("op_687_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_687 = slice_by_index(begin = var_687_begin_0, end = var_687_end_0, end_mask = var_687_end_mask_0, x = x_15)[name = tensor<string, []>("op_687")];
+            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = window_19)[name = tensor<string, []>("op_690")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_644, var_641))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_72, interleave = window_21_interleave_0, values = (var_690, var_687))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_695_begin_0 = const()[name = tensor<string, []>("op_695_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_695_end_0 = const()[name = tensor<string, []>("op_695_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_695_end_mask_0 = const()[name = tensor<string, []>("op_695_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_695 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = x_15)[name = tensor<string, []>("op_695")];
+            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = window_21)[name = tensor<string, []>("op_698")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_72, interleave = window_23_interleave_0, values = (var_698, var_695))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_103 = concat(axis = var_59, interleave = input_103_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_677_split_sizes_0 = const()[name = tensor<string, []>("op_677_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_677_axis_0 = const()[name = tensor<string, []>("op_677_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_677_0, tensor<fp32, [3, 256, 16]> var_677_1 = split(axis = var_677_axis_0, split_sizes = var_677_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_677")];
-            tensor<fp32, [3, 256, 16]> var_679 = sigmoid(x = var_677_1)[name = tensor<string, []>("op_679")];
-            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_677_0, y = var_679)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [3, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_723_split_sizes_0 = const()[name = tensor<string, []>("op_723_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_723_axis_0 = const()[name = tensor<string, []>("op_723_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_723_0, tensor<fp32, [3, 256, 16]> var_723_1 = split(axis = var_723_axis_0, split_sizes = var_723_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_723")];
+            tensor<fp32, [3, 256, 16]> var_725 = sigmoid(x = var_723_1)[name = tensor<string, []>("op_725")];
+            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_723_0, y = var_725)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [3, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [3, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_710_begin_0 = const()[name = tensor<string, []>("op_710_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_710_end_0 = const()[name = tensor<string, []>("op_710_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_710_end_mask_0 = const()[name = tensor<string, []>("op_710_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [3, 1, 256]> var_710 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_710")];
-            tensor<int32, [3]> var_712_perm_0 = const()[name = tensor<string, []>("op_712_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_712 = transpose(perm = var_712_perm_0, x = var_710)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 3, 256]> input_111 = add(x = x_15, y = var_712)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 3, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 3, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_735 = const()[name = tensor<string, []>("op_735"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_736 = mul(x = input_119, y = var_735)[name = tensor<string, []>("op_736")];
-            tensor<fp32, [1, 3, 256]> input_121 = add(x = var_736, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [3, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_758_perm_0 = const()[name = tensor<string, []>("op_758_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_758 = transpose(perm = var_758_perm_0, x = var_756)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 3, 256]> input_113 = add(x = x_15, y = var_758)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 3, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 3, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_782 = mul(x = input_121, y = var_781)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> input_123 = add(x = var_782, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 3, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 3, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_766 = mul(x = input_131, y = var_765)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 3, 256]> input_133 = add(x = var_766, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 3, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 3, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_812 = mul(x = input_133, y = var_811)[name = tensor<string, []>("op_812")];
+            tensor<fp32, [1, 3, 256]> input_135 = add(x = var_812, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -740,199 +758,192 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 3, 256]> var_780 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_782 = reshape(shape = var_781, x = var_780)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> var_826 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_827 = const()[name = tensor<string, []>("op_827"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_828 = reshape(shape = var_827, x = var_826)[name = tensor<string, []>("op_828")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_786 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_787 = const()[name = tensor<string, []>("op_787"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_788 = mul(x = var_786, y = var_787)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
+            tensor<fp32, [1, 3, 256]> var_832 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_834 = mul(x = var_832, y = var_833)[name = tensor<string, []>("op_834")];
+            tensor<int32, [4]> var_835 = const()[name = tensor<string, []>("op_835"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_836 = reshape(shape = var_835, x = var_834)[name = tensor<string, []>("op_836")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_794 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_795 = const()[name = tensor<string, []>("op_795"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_796 = reshape(shape = var_795, x = var_794)[name = tensor<string, []>("op_796")];
+            tensor<fp32, [1, 3, 256]> var_840 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_842 = reshape(shape = var_841, x = var_840)[name = tensor<string, []>("op_842")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 3, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [3]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_790)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_782)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_836)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_828)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 3, 3]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_806 = const()[name = tensor<string, []>("op_806"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_807 = reshape(shape = var_806, x = sqrt_s_t_7)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [3, 3]> M_7 = real_div(x = encoder__causal_mask, y = var_807)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 3, 3]> var_809 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_809")];
+            tensor<int32, [2]> var_852 = const()[name = tensor<string, []>("op_852"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_853 = reshape(shape = var_852, x = sqrt_s_t_7)[name = tensor<string, []>("op_853")];
+            tensor<fp32, [3, 3]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_853)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 3, 3]> var_855 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_855")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_796)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_809, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_811_transpose_x_0 = const()[name = tensor<string, []>("op_811_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_811_transpose_y_0 = const()[name = tensor<string, []>("op_811_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_811 = matmul(transpose_x = var_811_transpose_x_0, transpose_y = var_811_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_811")];
-            tensor<fp32, [3]> var_812 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
-            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_811, y = var_814)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_842)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_855, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_857_transpose_x_0 = const()[name = tensor<string, []>("op_857_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_857_transpose_y_0 = const()[name = tensor<string, []>("op_857_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_857 = matmul(transpose_x = var_857_transpose_x_0, transpose_y = var_857_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_857")];
+            tensor<fp32, [3]> var_858 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [4]> var_859 = const()[name = tensor<string, []>("op_859"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_860 = reshape(shape = var_859, x = var_858)[name = tensor<string, []>("op_860")];
+            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_857, y = var_860)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 3, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_817 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_817")];
-            tensor<bool, []> var_819_transpose_x_1 = const()[name = tensor<string, []>("op_819_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_819_transpose_y_1 = const()[name = tensor<string, []>("op_819_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_819 = matmul(transpose_x = var_819_transpose_x_1, transpose_y = var_819_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_817, y = var_819)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_821 = const()[name = tensor<string, []>("op_821"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_821)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_823 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_823")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_823)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_825_perm_0 = const()[name = tensor<string, []>("op_825_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_863 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_863")];
+            tensor<bool, []> var_865_transpose_x_1 = const()[name = tensor<string, []>("op_865_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_865_transpose_y_1 = const()[name = tensor<string, []>("op_865_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_865 = matmul(transpose_x = var_865_transpose_x_1, transpose_y = var_865_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_865")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_863, y = var_865)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_867)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_869 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_869")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_869)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_871_perm_0 = const()[name = tensor<string, []>("op_871_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_825 = transpose(perm = var_825_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_825)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_829, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 3, 256]> var_831 = silu(x = input_137)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [1, 3, 256]> input_139 = mul(x = var_831, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 3, 4, 64]> var_871 = transpose(perm = var_871_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_64, x = var_871)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_875, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 3, 256]> var_877 = silu(x = input_139)[name = tensor<string, []>("op_877")];
+            tensor<fp32, [1, 3, 256]> input_141 = mul(x = var_877, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_839_begin_0 = const()[name = tensor<string, []>("op_839_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_839_end_0 = const()[name = tensor<string, []>("op_839_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_839_end_mask_0 = const()[name = tensor<string, []>("op_839_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_839 = slice_by_index(begin = var_839_begin_0, end = var_839_end_0, end_mask = var_839_end_mask_0, x = x_21)[name = tensor<string, []>("op_839")];
-            tensor<int32, [3]> var_842_begin_0 = const()[name = tensor<string, []>("op_842_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_842_end_0 = const()[name = tensor<string, []>("op_842_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_842_end_mask_0 = const()[name = tensor<string, []>("op_842_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_842 = slice_by_index(begin = var_842_begin_0, end = var_842_end_0, end_mask = var_842_end_mask_0, x = window_25)[name = tensor<string, []>("op_842")];
+            tensor<int32, [3]> var_885_begin_0 = const()[name = tensor<string, []>("op_885_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_885_end_0 = const()[name = tensor<string, []>("op_885_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_885_end_mask_0 = const()[name = tensor<string, []>("op_885_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_885 = slice_by_index(begin = var_885_begin_0, end = var_885_end_0, end_mask = var_885_end_mask_0, x = x_21)[name = tensor<string, []>("op_885")];
+            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = window_25)[name = tensor<string, []>("op_888")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_842, var_839))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_847_begin_0 = const()[name = tensor<string, []>("op_847_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_847_end_0 = const()[name = tensor<string, []>("op_847_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_847_end_mask_0 = const()[name = tensor<string, []>("op_847_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_847 = slice_by_index(begin = var_847_begin_0, end = var_847_end_0, end_mask = var_847_end_mask_0, x = x_21)[name = tensor<string, []>("op_847")];
-            tensor<int32, [3]> var_850_begin_0 = const()[name = tensor<string, []>("op_850_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_850_end_0 = const()[name = tensor<string, []>("op_850_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_850_end_mask_0 = const()[name = tensor<string, []>("op_850_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_850 = slice_by_index(begin = var_850_begin_0, end = var_850_end_0, end_mask = var_850_end_mask_0, x = window_27)[name = tensor<string, []>("op_850")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_72, interleave = window_27_interleave_0, values = (var_888, var_885))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_893_begin_0 = const()[name = tensor<string, []>("op_893_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_893_end_0 = const()[name = tensor<string, []>("op_893_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_893_end_mask_0 = const()[name = tensor<string, []>("op_893_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_893 = slice_by_index(begin = var_893_begin_0, end = var_893_end_0, end_mask = var_893_end_mask_0, x = x_21)[name = tensor<string, []>("op_893")];
+            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = window_27)[name = tensor<string, []>("op_896")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_850, var_847))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_855_begin_0 = const()[name = tensor<string, []>("op_855_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_855_end_0 = const()[name = tensor<string, []>("op_855_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_855_end_mask_0 = const()[name = tensor<string, []>("op_855_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_855 = slice_by_index(begin = var_855_begin_0, end = var_855_end_0, end_mask = var_855_end_mask_0, x = x_21)[name = tensor<string, []>("op_855")];
-            tensor<int32, [3]> var_858_begin_0 = const()[name = tensor<string, []>("op_858_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_858_end_0 = const()[name = tensor<string, []>("op_858_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_858_end_mask_0 = const()[name = tensor<string, []>("op_858_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_858 = slice_by_index(begin = var_858_begin_0, end = var_858_end_0, end_mask = var_858_end_mask_0, x = window_29)[name = tensor<string, []>("op_858")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_72, interleave = window_29_interleave_0, values = (var_896, var_893))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_901_begin_0 = const()[name = tensor<string, []>("op_901_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_901_end_0 = const()[name = tensor<string, []>("op_901_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_901_end_mask_0 = const()[name = tensor<string, []>("op_901_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_901 = slice_by_index(begin = var_901_begin_0, end = var_901_end_0, end_mask = var_901_end_mask_0, x = x_21)[name = tensor<string, []>("op_901")];
+            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = window_29)[name = tensor<string, []>("op_904")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_858, var_855))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_72, interleave = window_interleave_0, values = (var_904, var_901))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_143 = concat(axis = var_59, interleave = input_143_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_883_split_sizes_0 = const()[name = tensor<string, []>("op_883_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_883_axis_0 = const()[name = tensor<string, []>("op_883_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_883_0, tensor<fp32, [3, 256, 16]> var_883_1 = split(axis = var_883_axis_0, split_sizes = var_883_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_883")];
-            tensor<fp32, [3, 256, 16]> var_885 = sigmoid(x = var_883_1)[name = tensor<string, []>("op_885")];
-            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_883_0, y = var_885)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [3, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_929_split_sizes_0 = const()[name = tensor<string, []>("op_929_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_929_0, tensor<fp32, [3, 256, 16]> var_929_1 = split(axis = var_929_axis_0, split_sizes = var_929_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [3, 256, 16]> var_931 = sigmoid(x = var_929_1)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_929_0, y = var_931)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [3, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [3, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_916_begin_0 = const()[name = tensor<string, []>("op_916_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_916_end_0 = const()[name = tensor<string, []>("op_916_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_916_end_mask_0 = const()[name = tensor<string, []>("op_916_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [3, 1, 256]> var_916 = slice_by_index(begin = var_916_begin_0, end = var_916_end_0, end_mask = var_916_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_916")];
-            tensor<int32, [3]> var_918_perm_0 = const()[name = tensor<string, []>("op_918_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_918 = transpose(perm = var_918_perm_0, x = var_916)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 3, 256]> input_151 = add(x = x_21, y = var_918)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 3, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 3, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_941 = const()[name = tensor<string, []>("op_941"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_942 = mul(x = input_159, y = var_941)[name = tensor<string, []>("op_942")];
-            tensor<fp32, [1, 3, 256]> input_161 = add(x = var_942, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [3, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_964_perm_0 = const()[name = tensor<string, []>("op_964_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_964 = transpose(perm = var_964_perm_0, x = var_962)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 3, 256]> input_153 = add(x = x_21, y = var_964)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 3, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 3, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_988 = mul(x = input_161, y = var_987)[name = tensor<string, []>("op_988")];
+            tensor<fp32, [1, 3, 256]> input_163 = add(x = var_988, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 3]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_61, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
-            tensor<int32, [3]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
-            tensor<bool, [3]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = cat)[name = tensor<string, []>("op_960")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 3, 1]> var_963 = reduce_l2_norm(axes = var_962, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_963")];
+            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1006_begin_0 = const()[name = tensor<string, []>("op_1006_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
+            tensor<int32, [3]> var_1006_end_0 = const()[name = tensor<string, []>("op_1006_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
+            tensor<bool, [3]> var_1006_end_mask_0 = const()[name = tensor<string, []>("op_1006_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1006_begin_0, end = var_1006_end_0, end_mask = var_1006_end_mask_0, x = cat)[name = tensor<string, []>("op_1006")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1008 = const()[name = tensor<string, []>("op_1008"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 3, 1]> var_1009 = reduce_l2_norm(axes = var_1008, keep_dims = var_55, x = input_165)[name = tensor<string, []>("op_1009")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_963)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_967_axis_0 = const()[name = tensor<string, []>("op_967_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_967_axis_0, values = (var_206, var_412, var_618, nkv_1))[name = tensor<string, []>("op_967")];
-            tensor<int32, []> var_969_axis_0 = const()[name = tensor<string, []>("op_969_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_969_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_969")];
-            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_971_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_971")];
-            tensor<fp32, []> var_980 = const()[name = tensor<string, []>("op_980"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_985 = const()[name = tensor<string, []>("op_985"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_988 = const()[name = tensor<string, []>("op_988"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_990 = const()[name = tensor<string, []>("op_990"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1000 = const()[name = tensor<string, []>("op_1000"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_69, beta = const_12, x = var_1009)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1013_axis_0 = const()[name = tensor<string, []>("op_1013_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1013_axis_0, values = (var_252, var_458, var_664, nkv_1))[name = tensor<string, []>("op_1013")];
+            tensor<int32, []> var_1015_axis_0 = const()[name = tensor<string, []>("op_1015_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1015_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1015")];
+            tensor<int32, []> var_1017_axis_0 = const()[name = tensor<string, []>("op_1017_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1017_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_1017")];
             tensor<fp32, [1, 3, 9, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 3, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1062_axes_0 = const()[name = tensor<string, []>("op_1062_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 3, 1, 256]> var_1062 = expand_dims(axes = var_1062_axes_0, x = emb)[name = tensor<string, []>("op_1062")];
+            tensor<int32, [1]> var_1085_axes_0 = const()[name = tensor<string, []>("op_1085_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 3, 1, 256]> var_1085 = expand_dims(axes = var_1085_axes_0, x = emb)[name = tensor<string, []>("op_1085")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 9, 1])];
-            tensor<fp32, [1, 3, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1062)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 3, 9, 512]> input_165 = concat(axis = var_994, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 3, 9, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1070_perm_0 = const()[name = tensor<string, []>("op_1070_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1074 = const()[name = tensor<string, []>("op_1074"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [1, 9, 3, 256]> var_1070 = transpose(perm = var_1070_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [9, 3, 256]> x_29 = reshape(shape = var_1074, x = var_1070)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 3, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1085)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 3, 9, 512]> input_167 = concat(axis = var_62, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 3, 9, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1093_perm_0 = const()[name = tensor<string, []>("op_1093_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [1, 9, 3, 256]> var_1093 = transpose(perm = var_1093_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [9, 3, 256]> x_29 = reshape(shape = var_1097, x = var_1093)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -943,132 +954,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [9, 3, 256]> var_1082 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1083 = const()[name = tensor<string, []>("op_1083"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1084 = reshape(shape = var_1083, x = var_1082)[name = tensor<string, []>("op_1084")];
+            tensor<fp32, [9, 3, 256]> var_1105 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1107 = reshape(shape = var_1106, x = var_1105)[name = tensor<string, []>("op_1107")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> var_1088 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1089 = const()[name = tensor<string, []>("op_1089"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 3, 256]> var_1090 = mul(x = var_1088, y = var_1089)[name = tensor<string, []>("op_1090")];
-            tensor<int32, [4]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1092 = reshape(shape = var_1091, x = var_1090)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [9, 3, 256]> var_1111 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1112 = const()[name = tensor<string, []>("op_1112"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 3, 256]> var_1113 = mul(x = var_1111, y = var_1112)[name = tensor<string, []>("op_1113")];
+            tensor<int32, [4]> var_1114 = const()[name = tensor<string, []>("op_1114"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1115 = reshape(shape = var_1114, x = var_1113)[name = tensor<string, []>("op_1115")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> var_1096 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1098 = reshape(shape = var_1097, x = var_1096)[name = tensor<string, []>("op_1098")];
+            tensor<fp32, [9, 3, 256]> var_1119 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [9, 3, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_1000, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_59, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [3]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_1 = clip(alpha = var_990, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [3]> clip_1 = clip(alpha = var_49, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [3]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1092)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [9, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1084)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [9, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1115)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [9, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1107)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [9, 4, 3, 3]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [2]>([1, 3])];
-            tensor<fp32, [1, 3]> var_1111 = reshape(shape = var_1110, x = valid_mask)[name = tensor<string, []>("op_1111")];
-            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1111)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1114 = reshape(shape = var_1113, x = sqrt_s_t_9)[name = tensor<string, []>("op_1114")];
-            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1114)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [9, 4, 3, 3]> var_1116 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1116")];
+            tensor<int32, [2]> var_1133 = const()[name = tensor<string, []>("op_1133"), val = tensor<int32, [2]>([1, 3])];
+            tensor<fp32, [1, 3]> var_1134 = reshape(shape = var_1133, x = valid_mask)[name = tensor<string, []>("op_1134")];
+            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1134)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1136 = const()[name = tensor<string, []>("op_1136"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1137 = reshape(shape = var_1136, x = sqrt_s_t_9)[name = tensor<string, []>("op_1137")];
+            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1137)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [9, 4, 3, 3]> var_1139 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1139")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1098)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [9, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1116, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1118_transpose_x_0 = const()[name = tensor<string, []>("op_1118_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1118_transpose_y_0 = const()[name = tensor<string, []>("op_1118_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 3, 64]> var_1118 = matmul(transpose_x = var_1118_transpose_x_0, transpose_y = var_1118_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1118")];
-            tensor<fp32, [3]> var_1119 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1119")];
-            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
-            tensor<fp32, [9, 4, 3, 64]> cross_9 = mul(x = var_1118, y = var_1121)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [9, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1121)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [9, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1139, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1141_transpose_x_0 = const()[name = tensor<string, []>("op_1141_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1141_transpose_y_0 = const()[name = tensor<string, []>("op_1141_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 3, 64]> var_1141 = matmul(transpose_x = var_1141_transpose_x_0, transpose_y = var_1141_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1141")];
+            tensor<fp32, [3]> var_1142 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1142")];
+            tensor<int32, [4]> var_1143 = const()[name = tensor<string, []>("op_1143"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1144 = reshape(shape = var_1143, x = var_1142)[name = tensor<string, []>("op_1144")];
+            tensor<fp32, [9, 4, 3, 64]> cross_9 = mul(x = var_1141, y = var_1144)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [9, 4, 3, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1124 = const()[name = tensor<string, []>("op_1124"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1125 = reshape(shape = var_1124, x = valid_mask)[name = tensor<string, []>("op_1125")];
-            tensor<fp32, [9, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1125)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1127 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1127")];
-            tensor<bool, []> var_1129_transpose_x_1 = const()[name = tensor<string, []>("op_1129_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1129_transpose_y_1 = const()[name = tensor<string, []>("op_1129_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1129 = matmul(transpose_x = var_1129_transpose_x_1, transpose_y = var_1129_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1129")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1127, y = var_1129)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1131_keep_dims_0 = const()[name = tensor<string, []>("op_1131_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1131 = reduce_sum(keep_dims = var_1131_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1131")];
-            tensor<int32, [1]> var_1132 = const()[name = tensor<string, []>("op_1132"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1133 = reshape(shape = var_1132, x = var_1131)[name = tensor<string, []>("op_1133")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1133)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1147 = const()[name = tensor<string, []>("op_1147"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1148 = reshape(shape = var_1147, x = valid_mask)[name = tensor<string, []>("op_1148")];
+            tensor<fp32, [9, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1148)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [9, 4, 64, 64]> var_1150 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1150")];
+            tensor<bool, []> var_1152_transpose_x_1 = const()[name = tensor<string, []>("op_1152_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1152_transpose_y_1 = const()[name = tensor<string, []>("op_1152_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1152 = matmul(transpose_x = var_1152_transpose_x_1, transpose_y = var_1152_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1152")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1150, y = var_1152)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1154_keep_dims_0 = const()[name = tensor<string, []>("op_1154_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1154 = reduce_sum(keep_dims = var_1154_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1154")];
+            tensor<int32, [1]> var_1155 = const()[name = tensor<string, []>("op_1155"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1156 = reshape(shape = var_1155, x = var_1154)[name = tensor<string, []>("op_1156")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1156)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_990, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_49, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1137 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1137")];
-            tensor<int32, [4]> var_1138_perm_0 = const()[name = tensor<string, []>("op_1138_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [9, 4, 64, 64]> var_1160 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1160")];
+            tensor<int32, [4]> var_1161_perm_0 = const()[name = tensor<string, []>("op_1161_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 3, 4, 64]> var_1138 = transpose(perm = var_1138_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [9, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_987, x = var_1138)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [9, 3, 256]> out_29 = reshape(shape = var_1142, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [9, 3, 256]> var_1144 = silu(x = input_169)[name = tensor<string, []>("op_1144")];
-            tensor<fp32, [9, 3, 256]> input_171 = mul(x = var_1144, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [9, 3, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [9, 3, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 3, 4, 64]> var_1161 = transpose(perm = var_1161_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [9, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_64, x = var_1161)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [9, 3, 256]> out_29 = reshape(shape = var_1165, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [9, 3, 256]> var_1167 = silu(x = input_171)[name = tensor<string, []>("op_1167")];
+            tensor<fp32, [9, 3, 256]> input_173 = mul(x = var_1167, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 3, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [9, 3, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_985, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 9, 3, 256])];
-            tensor<fp32, [1, 9, 3, 256]> var_1155 = reshape(shape = var_1154, x = xt_1)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156_perm_0 = const()[name = tensor<string, []>("op_1156_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [3]>([3, 9, 256])];
-            tensor<fp32, [1, 3, 9, 256]> var_1156 = transpose(perm = var_1156_perm_0, x = var_1155)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [3, 9, 256]> query_1 = reshape(shape = var_1159, x = var_1156)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [9, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_56, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [4]>([1, 9, 3, 256])];
+            tensor<fp32, [1, 9, 3, 256]> var_1178 = reshape(shape = var_1177, x = xt_1)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [4]> var_1179_perm_0 = const()[name = tensor<string, []>("op_1179_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([3, 9, 256])];
+            tensor<fp32, [1, 3, 9, 256]> var_1179 = transpose(perm = var_1179_perm_0, x = var_1178)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [3, 9, 256]> query_1 = reshape(shape = var_1182, x = var_1179)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 3, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [9, 3, 768]> var_1182 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [9, 3, 768]> var_1205 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([9, 3, 3, 256])];
-            tensor<fp32, [9, 3, 3, 256]> var_1184 = reshape(shape = concat_1, x = var_1182)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [1]> var_1185_axes_0 = const()[name = tensor<string, []>("op_1185_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 3, 3, 256]> var_1185 = expand_dims(axes = var_1185_axes_0, x = var_1184)[name = tensor<string, []>("op_1185")];
-            tensor<int32, [5]> var_1186_perm_0 = const()[name = tensor<string, []>("op_1186_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1187_axes_0 = const()[name = tensor<string, []>("op_1187_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 3, 1, 256]> var_1186 = transpose(perm = var_1186_perm_0, x = var_1185)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 9, 3, 256]> var_1187 = squeeze(axes = var_1187_axes_0, x = var_1186)[name = tensor<string, []>("op_1187")];
+            tensor<fp32, [9, 3, 3, 256]> var_1207 = reshape(shape = concat_1, x = var_1205)[name = tensor<string, []>("op_1207")];
+            tensor<int32, [1]> var_1208_axes_0 = const()[name = tensor<string, []>("op_1208_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 3, 3, 256]> var_1208 = expand_dims(axes = var_1208_axes_0, x = var_1207)[name = tensor<string, []>("op_1208")];
+            tensor<int32, [5]> var_1209_perm_0 = const()[name = tensor<string, []>("op_1209_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1210_axes_0 = const()[name = tensor<string, []>("op_1210_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 3, 1, 256]> var_1209 = transpose(perm = var_1209_perm_0, x = var_1208)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 9, 3, 256]> var_1210 = squeeze(axes = var_1210_axes_0, x = var_1209)[name = tensor<string, []>("op_1210")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 9, 3, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [9, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 9, 3, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [9, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 9, 3, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1195 = const()[name = tensor<string, []>("op_1195"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1196 = reshape(shape = var_1195, x = q_11)[name = tensor<string, []>("op_1196")];
+            tensor<fp32, [9, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1219 = reshape(shape = var_1218, x = q_11)[name = tensor<string, []>("op_1219")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1202 = const()[name = tensor<string, []>("op_1202"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1203 = reshape(shape = var_1202, x = k_11)[name = tensor<string, []>("op_1203")];
+            tensor<int32, [3]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1226 = reshape(shape = var_1225, x = k_11)[name = tensor<string, []>("op_1226")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1210 = reshape(shape = var_1209, x = v_11)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [3]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1233 = reshape(shape = var_1232, x = v_11)[name = tensor<string, []>("op_1233")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1213 = const()[name = tensor<string, []>("op_1213"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1196)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [3, 4, 9, 64]> q_15 = reshape(shape = var_1213, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1215 = const()[name = tensor<string, []>("op_1215"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1203)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [3, 4, 9, 64]> k_15 = reshape(shape = var_1215, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1210)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [3, 4, 9, 64]> v_15 = reshape(shape = var_1217, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1219)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [3, 4, 9, 64]> q_15 = reshape(shape = var_1236, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1226)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [3, 4, 9, 64]> k_15 = reshape(shape = var_1238, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1233)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [3, 4, 9, 64]> v_15 = reshape(shape = var_1240, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 9, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1079,30 +1090,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 9, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1220 = const()[name = tensor<string, []>("op_1220"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [2]>([27, 256])];
-            tensor<fp32, [9, 3, 4, 64]> var_1221 = transpose(perm = var_1220, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [27, 256]> attn_output_3 = reshape(shape = var_1225, x = var_1221)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [27, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1229 = const()[name = tensor<string, []>("op_1229"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [9, 3, 256]> attn_output_7 = reshape(shape = var_1229, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1243 = const()[name = tensor<string, []>("op_1243"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1248 = const()[name = tensor<string, []>("op_1248"), val = tensor<int32, [2]>([27, 256])];
+            tensor<fp32, [9, 3, 4, 64]> var_1244 = transpose(perm = var_1243, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [27, 256]> attn_output_3 = reshape(shape = var_1248, x = var_1244)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [27, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [9, 3, 256]> attn_output_7 = reshape(shape = var_1252, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 9, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [3, 9, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 9, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_985, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [3, 9, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [3, 9, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [3, 9, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [3, 9, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 9, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 9, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_56, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [3, 9, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [3, 9, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 9, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [3, 9, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_985, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([1, 3, 9, 256])];
-            tensor<fp32, [1, 3, 9, 256]> x_31 = reshape(shape = var_1249, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1255 = const()[name = tensor<string, []>("op_1255"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [1, 9, 3, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [9, 3, 256]> x = reshape(shape = var_1255, x = var_1251)[name = tensor<string, []>("x")];
+            tensor<fp32, [3, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_56, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([1, 3, 9, 256])];
+            tensor<fp32, [1, 3, 9, 256]> x_31 = reshape(shape = var_1272, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1274_perm_0 = const()[name = tensor<string, []>("op_1274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [1, 9, 3, 256]> var_1274 = transpose(perm = var_1274_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [9, 3, 256]> x = reshape(shape = var_1278, x = var_1274)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1113,120 +1124,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [9, 3, 256]> var_1263 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1264 = const()[name = tensor<string, []>("op_1264"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1265 = reshape(shape = var_1264, x = var_1263)[name = tensor<string, []>("op_1265")];
+            tensor<fp32, [9, 3, 256]> var_1286 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1288 = reshape(shape = var_1287, x = var_1286)[name = tensor<string, []>("op_1288")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> var_1269 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1270 = const()[name = tensor<string, []>("op_1270"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 3, 256]> var_1271 = mul(x = var_1269, y = var_1270)[name = tensor<string, []>("op_1271")];
-            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1273 = reshape(shape = var_1272, x = var_1271)[name = tensor<string, []>("op_1273")];
+            tensor<fp32, [9, 3, 256]> var_1292 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1293 = const()[name = tensor<string, []>("op_1293"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 3, 256]> var_1294 = mul(x = var_1292, y = var_1293)[name = tensor<string, []>("op_1294")];
+            tensor<int32, [4]> var_1295 = const()[name = tensor<string, []>("op_1295"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1296 = reshape(shape = var_1295, x = var_1294)[name = tensor<string, []>("op_1296")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> var_1277 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([9, 3, 4, 64])];
-            tensor<fp32, [9, 3, 4, 64]> var_1279 = reshape(shape = var_1278, x = var_1277)[name = tensor<string, []>("op_1279")];
+            tensor<fp32, [9, 3, 256]> var_1300 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([9, 3, 4, 64])];
+            tensor<fp32, [9, 3, 4, 64]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 3, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [9, 3, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [3]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_3 = clip(alpha = var_990, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [3]> clip_3 = clip(alpha = var_49, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [3]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1273)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [9, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1265)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [9, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1296)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [9, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1288)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [9, 4, 3, 3]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1295 = reshape(shape = var_1294, x = sqrt_s_t)[name = tensor<string, []>("op_1295")];
-            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1295)[name = tensor<string, []>("M")];
-            tensor<fp32, [9, 4, 3, 3]> var_1297 = mul(x = qk, y = M)[name = tensor<string, []>("op_1297")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1279)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [9, 4, 3, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1297, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1299_transpose_x_0 = const()[name = tensor<string, []>("op_1299_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1299_transpose_y_0 = const()[name = tensor<string, []>("op_1299_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 3, 64]> var_1299 = matmul(transpose_x = var_1299_transpose_x_0, transpose_y = var_1299_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1299")];
-            tensor<fp32, [3]> var_1300 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1300")];
-            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
-            tensor<fp32, [9, 4, 3, 64]> cross = mul(x = var_1299, y = var_1302)[name = tensor<string, []>("cross")];
-            tensor<fp32, [9, 4, 3, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [9, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1125)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [9, 4, 64, 64]> var_1308 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1308")];
-            tensor<bool, []> var_1310_transpose_x_1 = const()[name = tensor<string, []>("op_1310_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1310_transpose_y_1 = const()[name = tensor<string, []>("op_1310_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1310 = matmul(transpose_x = var_1310_transpose_x_1, transpose_y = var_1310_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1310")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1308, y = var_1310)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1133)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1318 = reshape(shape = var_1317, x = sqrt_s_t)[name = tensor<string, []>("op_1318")];
+            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1318)[name = tensor<string, []>("M")];
+            tensor<fp32, [9, 4, 3, 3]> var_1320 = mul(x = qk, y = M)[name = tensor<string, []>("op_1320")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1302)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [9, 4, 3, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1320, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1322_transpose_x_0 = const()[name = tensor<string, []>("op_1322_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1322_transpose_y_0 = const()[name = tensor<string, []>("op_1322_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 3, 64]> var_1322 = matmul(transpose_x = var_1322_transpose_x_0, transpose_y = var_1322_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1322")];
+            tensor<fp32, [3]> var_1323 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1323")];
+            tensor<int32, [4]> var_1324 = const()[name = tensor<string, []>("op_1324"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1325 = reshape(shape = var_1324, x = var_1323)[name = tensor<string, []>("op_1325")];
+            tensor<fp32, [9, 4, 3, 64]> cross = mul(x = var_1322, y = var_1325)[name = tensor<string, []>("cross")];
+            tensor<fp32, [9, 4, 3, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [9, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1148)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [9, 4, 64, 64]> var_1331 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1331")];
+            tensor<bool, []> var_1333_transpose_x_1 = const()[name = tensor<string, []>("op_1333_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1333_transpose_y_1 = const()[name = tensor<string, []>("op_1333_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1333 = matmul(transpose_x = var_1333_transpose_x_1, transpose_y = var_1333_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1333")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1331, y = var_1333)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1156)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_990, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_49, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [9, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1319_perm_0 = const()[name = tensor<string, []>("op_1319_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1342_perm_0 = const()[name = tensor<string, []>("op_1342_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 3, 4, 64]> var_1319 = transpose(perm = var_1319_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [9, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_987, x = var_1319)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [9, 3, 256]> out = reshape(shape = var_1323, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [9, 3, 256]> var_1325 = silu(x = input_187)[name = tensor<string, []>("op_1325")];
-            tensor<fp32, [9, 3, 256]> input_189 = mul(x = var_1325, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [9, 3, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [9, 3, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 3, 4, 64]> var_1342 = transpose(perm = var_1342_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [9, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_64, x = var_1342)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [9, 3, 256]> out = reshape(shape = var_1346, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [9, 3, 256]> var_1348 = silu(x = input_189)[name = tensor<string, []>("op_1348")];
+            tensor<fp32, [9, 3, 256]> input_191 = mul(x = var_1348, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 3, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [9, 3, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_985, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 9, 3, 256])];
-            tensor<fp32, [1, 9, 3, 256]> var_1336 = reshape(shape = var_1335, x = xt_5)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337_perm_0 = const()[name = tensor<string, []>("op_1337_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [3]>([3, 9, 256])];
-            tensor<fp32, [1, 3, 9, 256]> var_1337 = transpose(perm = var_1337_perm_0, x = var_1336)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [3, 9, 256]> query_5 = reshape(shape = var_1340, x = var_1337)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [9, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_56, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [4]>([1, 9, 3, 256])];
+            tensor<fp32, [1, 9, 3, 256]> var_1359 = reshape(shape = var_1358, x = xt_5)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [4]> var_1360_perm_0 = const()[name = tensor<string, []>("op_1360_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([3, 9, 256])];
+            tensor<fp32, [1, 3, 9, 256]> var_1360 = transpose(perm = var_1360_perm_0, x = var_1359)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [3, 9, 256]> query_5 = reshape(shape = var_1363, x = var_1360)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 3, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [9, 3, 768]> var_1363 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [9, 3, 768]> var_1386 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([9, 3, 3, 256])];
-            tensor<fp32, [9, 3, 3, 256]> var_1365 = reshape(shape = concat_2, x = var_1363)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [1]> var_1366_axes_0 = const()[name = tensor<string, []>("op_1366_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 3, 3, 256]> var_1366 = expand_dims(axes = var_1366_axes_0, x = var_1365)[name = tensor<string, []>("op_1366")];
-            tensor<int32, [5]> var_1367_perm_0 = const()[name = tensor<string, []>("op_1367_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1368_axes_0 = const()[name = tensor<string, []>("op_1368_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 3, 1, 256]> var_1367 = transpose(perm = var_1367_perm_0, x = var_1366)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 9, 3, 256]> var_1368 = squeeze(axes = var_1368_axes_0, x = var_1367)[name = tensor<string, []>("op_1368")];
+            tensor<fp32, [9, 3, 3, 256]> var_1388 = reshape(shape = concat_2, x = var_1386)[name = tensor<string, []>("op_1388")];
+            tensor<int32, [1]> var_1389_axes_0 = const()[name = tensor<string, []>("op_1389_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 3, 3, 256]> var_1389 = expand_dims(axes = var_1389_axes_0, x = var_1388)[name = tensor<string, []>("op_1389")];
+            tensor<int32, [5]> var_1390_perm_0 = const()[name = tensor<string, []>("op_1390_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1391_axes_0 = const()[name = tensor<string, []>("op_1391_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 3, 1, 256]> var_1390 = transpose(perm = var_1390_perm_0, x = var_1389)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 9, 3, 256]> var_1391 = squeeze(axes = var_1391_axes_0, x = var_1390)[name = tensor<string, []>("op_1391")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 9, 3, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [9, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 9, 3, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [9, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 9, 3, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1376 = const()[name = tensor<string, []>("op_1376"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1377 = reshape(shape = var_1376, x = q_19)[name = tensor<string, []>("op_1377")];
+            tensor<fp32, [9, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1399 = const()[name = tensor<string, []>("op_1399"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1400 = reshape(shape = var_1399, x = q_19)[name = tensor<string, []>("op_1400")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1383 = const()[name = tensor<string, []>("op_1383"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1384 = reshape(shape = var_1383, x = k_19)[name = tensor<string, []>("op_1384")];
+            tensor<int32, [3]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1407 = reshape(shape = var_1406, x = k_19)[name = tensor<string, []>("op_1407")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1390 = const()[name = tensor<string, []>("op_1390"), val = tensor<int32, [3]>([9, 12, 64])];
-            tensor<fp32, [9, 12, 64]> var_1391 = reshape(shape = var_1390, x = v_19)[name = tensor<string, []>("op_1391")];
+            tensor<int32, [3]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [3]>([9, 12, 64])];
+            tensor<fp32, [9, 12, 64]> var_1414 = reshape(shape = var_1413, x = v_19)[name = tensor<string, []>("op_1414")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1394 = const()[name = tensor<string, []>("op_1394"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1377)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [3, 4, 9, 64]> q = reshape(shape = var_1394, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1396 = const()[name = tensor<string, []>("op_1396"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1384)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [3, 4, 9, 64]> k = reshape(shape = var_1396, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([3, 4, 9, 64])];
-            tensor<fp32, [12, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1391)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [3, 4, 9, 64]> v = reshape(shape = var_1398, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1417 = const()[name = tensor<string, []>("op_1417"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1400)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [3, 4, 9, 64]> q = reshape(shape = var_1417, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1419 = const()[name = tensor<string, []>("op_1419"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1407)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [3, 4, 9, 64]> k = reshape(shape = var_1419, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1421 = const()[name = tensor<string, []>("op_1421"), val = tensor<int32, [4]>([3, 4, 9, 64])];
+            tensor<fp32, [12, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1414)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [3, 4, 9, 64]> v = reshape(shape = var_1421, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 9, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1237,36 +1248,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 9, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1401 = const()[name = tensor<string, []>("op_1401"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([27, 256])];
-            tensor<fp32, [9, 3, 4, 64]> var_1402 = transpose(perm = var_1401, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [27, 256]> attn_output_11 = reshape(shape = var_1406, x = var_1402)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [27, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1410 = const()[name = tensor<string, []>("op_1410"), val = tensor<int32, [3]>([9, 3, 256])];
-            tensor<fp32, [9, 3, 256]> attn_output = reshape(shape = var_1410, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1424 = const()[name = tensor<string, []>("op_1424"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1429 = const()[name = tensor<string, []>("op_1429"), val = tensor<int32, [2]>([27, 256])];
+            tensor<fp32, [9, 3, 4, 64]> var_1425 = transpose(perm = var_1424, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [27, 256]> attn_output_11 = reshape(shape = var_1429, x = var_1425)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [27, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [3]>([9, 3, 256])];
+            tensor<fp32, [9, 3, 256]> attn_output = reshape(shape = var_1433, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 9, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [3, 9, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 9, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_985, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [3, 9, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [3, 9, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [3, 9, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [3, 9, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 9, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 9, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_56, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [3, 9, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [3, 9, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 9, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [3, 9, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_985, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([1, 3, 9, 256])];
-            tensor<fp32, [1, 3, 9, 256]> input = reshape(shape = var_1430, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1432 = const()[name = tensor<string, []>("op_1432"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 9, 1]> var_1433 = reduce_l2_norm(axes = var_1432, keep_dims = var_988, x = input)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [3, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_56, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [4]>([1, 3, 9, 256])];
+            tensor<fp32, [1, 3, 9, 256]> input = reshape(shape = var_1453, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 9, 1]> var_1456 = reduce_l2_norm(axes = var_1455, keep_dims = var_55, x = input)[name = tensor<string, []>("op_1456")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 9, 1]> clip_5 = clip(alpha = var_980, beta = const_42, x = var_1433)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 3, 9, 256]> var_1435 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1435")];
+            tensor<fp32, [1, 3, 9, 1]> clip_5 = clip(alpha = var_69, beta = const_42, x = var_1456)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 3, 9, 256]> var_1458 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1458")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([3, 1, 256])];
             tensor<fp32, [3, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([3, 256, 9])];
-            tensor<fp32, [1, 3, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1435)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 3, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1458)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [3, 256, 9]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1277,10 +1288,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 3, 8])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 3, 7]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 3, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1439")];
-            tensor<int32, []> var_1441_axis_0 = const()[name = tensor<string, []>("op_1441_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1441_axis_0, values = (var_1137, nkv))[name = tensor<string, []>("op_1441")];
-            tensor<int32, []> var_1443_axis_0 = const()[name = tensor<string, []>("op_1443_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1443_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1443")];
+            tensor<fp32, [1, 3, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1462")];
+            tensor<int32, []> var_1464_axis_0 = const()[name = tensor<string, []>("op_1464_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1464_axis_0, values = (var_1160, nkv))[name = tensor<string, []>("op_1464")];
+            tensor<int32, []> var_1466_axis_0 = const()[name = tensor<string, []>("op_1466_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1466_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1466")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 09eaf79a16883ad1d32f03778b55635fbbffe830..8d1afc73657769e2f6f912f61c64f916553c8734 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e633b39d71e29671336855550a3677233cf59cb49fe3e97a9c94677d5acb3f1
-size 185460
+oid sha256:3f5830d50c24f80450f2c08d152e16bb1da8692b3b26381056871626e0808fbc
+size 191005
diff --git a/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Manifest.json b/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Manifest.json
index 250225980694a093a91bfcf274f8ddeff9d47549..f08e9b846fafdcbf668951e72cf7d4330ceb98f1 100644
--- a/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Manifest.json
+++ b/optimized/ch/300ms/ls_eend_ch_300ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "539B39A7-7E21-40EC-852C-90E40A6CCF50": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "743E81BC-0B9E-4702-AC8D-3B134F65C5D1": {
+        "2FAA383D-FADF-4A2C-B5BA-06B37FB7467E": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "60829799-5C9A-43C1-B425-7785E458D03C": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "539B39A7-7E21-40EC-852C-90E40A6CCF50"
+    "rootModelIdentifier": "60829799-5C9A-43C1-B425-7785E458D03C"
 }
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/analytics/coremldata.bin b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/analytics/coremldata.bin
index 8ef1ae6589fe15764e0c192933da3263e5f868b8..e90bf2e6774457a495929bffad5627e2beeb349f 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:030a2bc7f91827f6efdac515150972512aa462ecb691365f25d97480cc537e7b
+oid sha256:d475261f5ecfad58b0970d8de176466ec976310030a3c98ebbde85306d5a944b
 size 243
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/coremldata.bin b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/coremldata.bin
index a82c5d753734fd18061bb4de2c2b1b5334ef3f57..ce4dc38e34ee492c982a194732ec4cb2dae10950 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/coremldata.bin
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8fe19dbc10aceae5862d5502e24f0c7c00f356c66d3828aae97b2c41383a71b
-size 1301
+oid sha256:fe75c57185d939ba1e81b4dcd5f4f2b95480d18f25636cd453e7b4eec7f29680
+size 1404
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/metadata.json b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/metadata.json
index 1479f59b57daba4565d10a86496002152bdb5208..b0e2f6533304881151f96ea28c711c20a4457ff4 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/metadata.json
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=4, max_speakers=7)",
+    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=4, max_speakers=7, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 64,
+      "Ios17.sliceByIndex" : 68,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 22,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 4 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 45 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 4, 345]",
+        "shape" : "[1, 45, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 45}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/model.mil b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/model.mil
index 8e884a60ea34232c188dc4cdb669760172249293..0c554f0ab421d447767fe6f10a2fde3b97d7301b 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/model.mil
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlmodelc/model.mil
@@ -1,234 +1,256 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 4, 345]> features, tensor<fp32, [4]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [4, 4]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [4]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
-            tensor<fp32, [4, 4]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 45, 23]> features, tensor<fp32, [4]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [4, 4]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [4]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
+            tensor<fp32, [4, 4]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<int32, [3]>([1, 4, 345])];
+            tensor<fp32, [1, 4, 345]> input_1 = reshape(shape = var_56, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_65 = const()[name = tensor<string, []>("op_65"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_71 = const()[name = tensor<string, []>("op_71"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 4, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 4, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 4, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_66, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 4, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 4, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_204 = mul(x = input_13, y = var_203)[name = tensor<string, []>("op_204")];
+            tensor<fp32, [1, 4, 256]> input_15 = add(x = var_204, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,173 +261,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 4, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 4, 256]> var_218 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_219 = const()[name = tensor<string, []>("op_219"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_220 = reshape(shape = var_219, x = var_218)[name = tensor<string, []>("op_220")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 4, 256]> var_224 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_225 = const()[name = tensor<string, []>("op_225"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_226 = mul(x = var_224, y = var_225)[name = tensor<string, []>("op_226")];
+            tensor<int32, [4]> var_227 = const()[name = tensor<string, []>("op_227"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_228 = reshape(shape = var_227, x = var_226)[name = tensor<string, []>("op_228")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 4, 256]> var_232 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_233 = const()[name = tensor<string, []>("op_233"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_234 = reshape(shape = var_233, x = var_232)[name = tensor<string, []>("op_234")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 4, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [4]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_228)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_220)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 4, 4]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [4, 4]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 4, 4]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_245 = reshape(shape = var_244, x = sqrt_s_t_1)[name = tensor<string, []>("op_245")];
+            tensor<fp32, [4, 4]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_245)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 4, 4]> var_247 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_247")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [4]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_234)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_247, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_249_transpose_x_0 = const()[name = tensor<string, []>("op_249_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_249_transpose_y_0 = const()[name = tensor<string, []>("op_249_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_249 = matmul(transpose_x = var_249_transpose_x_0, transpose_y = var_249_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [4]> var_250 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_250")];
+            tensor<int32, [4]> var_251 = const()[name = tensor<string, []>("op_251"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_252 = reshape(shape = var_251, x = var_250)[name = tensor<string, []>("op_252")];
+            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_249, y = var_252)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 4, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_255 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_255")];
+            tensor<bool, []> var_257_transpose_x_1 = const()[name = tensor<string, []>("op_257_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_257_transpose_y_1 = const()[name = tensor<string, []>("op_257_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_257 = matmul(transpose_x = var_257_transpose_x_1, transpose_y = var_257_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_255, y = var_257)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_259 = const()[name = tensor<string, []>("op_259"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_259)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_261 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_261")];
+            tensor<fp32, [1, 4, 64, 64]> var_262 = real_div(x = new_kv_unnorm_1, y = var_261)[name = tensor<string, []>("op_262")];
+            tensor<int32, [4]> var_263_perm_0 = const()[name = tensor<string, []>("op_263_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 4, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 4, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 4, 4, 64]> var_263 = transpose(perm = var_263_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_74, x = var_263)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_267 = const()[name = tensor<string, []>("op_267"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_267, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 4, 256]> var_269 = silu(x = input_19)[name = tensor<string, []>("op_269")];
+            tensor<fp32, [1, 4, 256]> input_21 = mul(x = var_269, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_277_begin_0 = const()[name = tensor<string, []>("op_277_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_277_end_0 = const()[name = tensor<string, []>("op_277_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_277_end_mask_0 = const()[name = tensor<string, []>("op_277_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_277 = slice_by_index(begin = var_277_begin_0, end = var_277_end_0, end_mask = var_277_end_mask_0, x = x_3)[name = tensor<string, []>("op_277")];
+            tensor<int32, [3]> var_280_begin_0 = const()[name = tensor<string, []>("op_280_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_280_end_0 = const()[name = tensor<string, []>("op_280_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_280_end_mask_0 = const()[name = tensor<string, []>("op_280_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_280 = slice_by_index(begin = var_280_begin_0, end = var_280_end_0, end_mask = var_280_end_mask_0, x = window_1)[name = tensor<string, []>("op_280")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_82, interleave = window_3_interleave_0, values = (var_280, var_277))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_285_begin_0 = const()[name = tensor<string, []>("op_285_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_285_end_0 = const()[name = tensor<string, []>("op_285_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_285_end_mask_0 = const()[name = tensor<string, []>("op_285_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_285 = slice_by_index(begin = var_285_begin_0, end = var_285_end_0, end_mask = var_285_end_mask_0, x = x_3)[name = tensor<string, []>("op_285")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = window_3)[name = tensor<string, []>("op_288")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_82, interleave = window_5_interleave_0, values = (var_288, var_285))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_293_begin_0 = const()[name = tensor<string, []>("op_293_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_293_end_0 = const()[name = tensor<string, []>("op_293_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_293_end_mask_0 = const()[name = tensor<string, []>("op_293_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_293 = slice_by_index(begin = var_293_begin_0, end = var_293_end_0, end_mask = var_293_end_mask_0, x = x_3)[name = tensor<string, []>("op_293")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = window_5)[name = tensor<string, []>("op_296")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_245_begin_0 = const()[name = tensor<string, []>("op_245_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_245_end_0 = const()[name = tensor<string, []>("op_245_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_245_end_mask_0 = const()[name = tensor<string, []>("op_245_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_245 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = x_3)[name = tensor<string, []>("op_245")];
-            tensor<int32, [3]> var_248_begin_0 = const()[name = tensor<string, []>("op_248_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_248_end_0 = const()[name = tensor<string, []>("op_248_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_248_end_mask_0 = const()[name = tensor<string, []>("op_248_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_248 = slice_by_index(begin = var_248_begin_0, end = var_248_end_0, end_mask = var_248_end_mask_0, x = window_7)[name = tensor<string, []>("op_248")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_82, interleave = window_7_interleave_0, values = (var_296, var_293))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_301 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = x_3)[name = tensor<string, []>("op_301")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = window_7)[name = tensor<string, []>("op_304")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_248, var_245))[name = tensor<string, []>("window_9")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_82, interleave = window_9_interleave_0, values = (var_304, var_301))[name = tensor<string, []>("window_9")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_23 = concat(axis = var_69, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_273_split_sizes_0 = const()[name = tensor<string, []>("op_273_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_273_axis_0 = const()[name = tensor<string, []>("op_273_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_273_0, tensor<fp32, [4, 256, 16]> var_273_1 = split(axis = var_273_axis_0, split_sizes = var_273_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_273")];
-            tensor<fp32, [4, 256, 16]> var_275 = sigmoid(x = var_273_1)[name = tensor<string, []>("op_275")];
-            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_273_0, y = var_275)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [4, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_329_split_sizes_0 = const()[name = tensor<string, []>("op_329_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_329_axis_0 = const()[name = tensor<string, []>("op_329_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_329_0, tensor<fp32, [4, 256, 16]> var_329_1 = split(axis = var_329_axis_0, split_sizes = var_329_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_329")];
+            tensor<fp32, [4, 256, 16]> var_331 = sigmoid(x = var_329_1)[name = tensor<string, []>("op_331")];
+            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_329_0, y = var_331)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [4, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [4, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [4, 1, 256]> var_306 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_306")];
-            tensor<int32, [3]> var_308_perm_0 = const()[name = tensor<string, []>("op_308_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_308 = transpose(perm = var_308_perm_0, x = var_306)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 4, 256]> input_31 = add(x = x_3, y = var_308)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 4, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 4, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_331 = const()[name = tensor<string, []>("op_331"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_332 = mul(x = input_39, y = var_331)[name = tensor<string, []>("op_332")];
-            tensor<fp32, [1, 4, 256]> input_41 = add(x = var_332, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_362_begin_0 = const()[name = tensor<string, []>("op_362_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_362_end_0 = const()[name = tensor<string, []>("op_362_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_362_end_mask_0 = const()[name = tensor<string, []>("op_362_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [4, 1, 256]> var_362 = slice_by_index(begin = var_362_begin_0, end = var_362_end_0, end_mask = var_362_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_362")];
+            tensor<int32, [3]> var_364_perm_0 = const()[name = tensor<string, []>("op_364_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_364 = transpose(perm = var_364_perm_0, x = var_362)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 4, 256]> input_33 = add(x = x_3, y = var_364)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 4, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 4, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_388 = mul(x = input_41, y = var_387)[name = tensor<string, []>("op_388")];
+            tensor<fp32, [1, 4, 256]> input_43 = add(x = var_388, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 4, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 4, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_362 = mul(x = input_51, y = var_361)[name = tensor<string, []>("op_362")];
-            tensor<fp32, [1, 4, 256]> input_53 = add(x = var_362, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 4, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 4, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_418 = mul(x = input_53, y = var_417)[name = tensor<string, []>("op_418")];
+            tensor<fp32, [1, 4, 256]> input_55 = add(x = var_418, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -416,173 +438,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 4, 256]> var_376 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 4, 256]> var_432 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_433 = const()[name = tensor<string, []>("op_433"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_434 = reshape(shape = var_433, x = var_432)[name = tensor<string, []>("op_434")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_382 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_384 = mul(x = var_382, y = var_383)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
+            tensor<fp32, [1, 4, 256]> var_438 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_439 = const()[name = tensor<string, []>("op_439"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_440 = mul(x = var_438, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441 = const()[name = tensor<string, []>("op_441"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_442 = reshape(shape = var_441, x = var_440)[name = tensor<string, []>("op_442")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_390 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_391 = const()[name = tensor<string, []>("op_391"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_392 = reshape(shape = var_391, x = var_390)[name = tensor<string, []>("op_392")];
+            tensor<fp32, [1, 4, 256]> var_446 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 4, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [4]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_386)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_442)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_434)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 4, 4]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_402 = const()[name = tensor<string, []>("op_402"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_403 = reshape(shape = var_402, x = sqrt_s_t_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [4, 4]> M_3 = real_div(x = encoder__causal_mask, y = var_403)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 4, 4]> var_405 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_405")];
+            tensor<int32, [2]> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_459 = reshape(shape = var_458, x = sqrt_s_t_3)[name = tensor<string, []>("op_459")];
+            tensor<fp32, [4, 4]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_459)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 4, 4]> var_461 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_461")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_392)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_405, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_407_transpose_x_0 = const()[name = tensor<string, []>("op_407_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_407_transpose_y_0 = const()[name = tensor<string, []>("op_407_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_0, transpose_y = var_407_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [4]> var_408 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_408")];
-            tensor<int32, [4]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_410 = reshape(shape = var_409, x = var_408)[name = tensor<string, []>("op_410")];
-            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_407, y = var_410)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_448)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_461, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_463_transpose_x_0 = const()[name = tensor<string, []>("op_463_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_463_transpose_y_0 = const()[name = tensor<string, []>("op_463_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_463 = matmul(transpose_x = var_463_transpose_x_0, transpose_y = var_463_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [4]> var_464 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_464")];
+            tensor<int32, [4]> var_465 = const()[name = tensor<string, []>("op_465"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_466 = reshape(shape = var_465, x = var_464)[name = tensor<string, []>("op_466")];
+            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_463, y = var_466)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 4, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_413 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_413")];
-            tensor<bool, []> var_415_transpose_x_1 = const()[name = tensor<string, []>("op_415_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_415_transpose_y_1 = const()[name = tensor<string, []>("op_415_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_415 = matmul(transpose_x = var_415_transpose_x_1, transpose_y = var_415_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_415")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_413, y = var_415)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_417)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_419 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 64, 64]> var_420 = real_div(x = new_kv_unnorm_3, y = var_419)[name = tensor<string, []>("op_420")];
-            tensor<int32, [4]> var_421_perm_0 = const()[name = tensor<string, []>("op_421_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_469 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_469")];
+            tensor<bool, []> var_471_transpose_x_1 = const()[name = tensor<string, []>("op_471_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_471_transpose_y_1 = const()[name = tensor<string, []>("op_471_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_471 = matmul(transpose_x = var_471_transpose_x_1, transpose_y = var_471_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_469, y = var_471)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_473 = const()[name = tensor<string, []>("op_473"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_473)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_475 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_475")];
+            tensor<fp32, [1, 4, 64, 64]> var_476 = real_div(x = new_kv_unnorm_3, y = var_475)[name = tensor<string, []>("op_476")];
+            tensor<int32, [4]> var_477_perm_0 = const()[name = tensor<string, []>("op_477_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_421 = transpose(perm = var_421_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_421)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_425 = const()[name = tensor<string, []>("op_425"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_425, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 4, 256]> var_427 = silu(x = input_57)[name = tensor<string, []>("op_427")];
-            tensor<fp32, [1, 4, 256]> input_59 = mul(x = var_427, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 4, 4, 64]> var_477 = transpose(perm = var_477_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_74, x = var_477)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_481 = const()[name = tensor<string, []>("op_481"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_481, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 4, 256]> var_483 = silu(x = input_59)[name = tensor<string, []>("op_483")];
+            tensor<fp32, [1, 4, 256]> input_61 = mul(x = var_483, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_11_begin_0 = const()[name = tensor<string, []>("window_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_11_end_0 = const()[name = tensor<string, []>("window_11_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_11_end_mask_0 = const()[name = tensor<string, []>("window_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_11_squeeze_mask_0 = const()[name = tensor<string, []>("window_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_11 = slice_by_index(begin = window_11_begin_0, end = window_11_end_0, end_mask = window_11_end_mask_0, squeeze_mask = window_11_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<int32, [3]> var_491_begin_0 = const()[name = tensor<string, []>("op_491_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_491_end_0 = const()[name = tensor<string, []>("op_491_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_491_end_mask_0 = const()[name = tensor<string, []>("op_491_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_491 = slice_by_index(begin = var_491_begin_0, end = var_491_end_0, end_mask = var_491_end_mask_0, x = x_9)[name = tensor<string, []>("op_491")];
+            tensor<int32, [3]> var_494_begin_0 = const()[name = tensor<string, []>("op_494_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_494_end_0 = const()[name = tensor<string, []>("op_494_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_494_end_mask_0 = const()[name = tensor<string, []>("op_494_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_494 = slice_by_index(begin = var_494_begin_0, end = var_494_end_0, end_mask = var_494_end_mask_0, x = window_11)[name = tensor<string, []>("op_494")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_82, interleave = window_13_interleave_0, values = (var_494, var_491))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_499_begin_0 = const()[name = tensor<string, []>("op_499_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_499_end_0 = const()[name = tensor<string, []>("op_499_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_499_end_mask_0 = const()[name = tensor<string, []>("op_499_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_499 = slice_by_index(begin = var_499_begin_0, end = var_499_end_0, end_mask = var_499_end_mask_0, x = x_9)[name = tensor<string, []>("op_499")];
+            tensor<int32, [3]> var_502_begin_0 = const()[name = tensor<string, []>("op_502_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_502_end_0 = const()[name = tensor<string, []>("op_502_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_502_end_mask_0 = const()[name = tensor<string, []>("op_502_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_502 = slice_by_index(begin = var_502_begin_0, end = var_502_end_0, end_mask = var_502_end_mask_0, x = window_13)[name = tensor<string, []>("op_502")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_451_begin_0 = const()[name = tensor<string, []>("op_451_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_451_end_0 = const()[name = tensor<string, []>("op_451_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_451_end_mask_0 = const()[name = tensor<string, []>("op_451_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_451 = slice_by_index(begin = var_451_begin_0, end = var_451_end_0, end_mask = var_451_end_mask_0, x = x_9)[name = tensor<string, []>("op_451")];
-            tensor<int32, [3]> var_454_begin_0 = const()[name = tensor<string, []>("op_454_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_454_end_0 = const()[name = tensor<string, []>("op_454_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_454_end_mask_0 = const()[name = tensor<string, []>("op_454_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_454 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = window_15)[name = tensor<string, []>("op_454")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_82, interleave = window_15_interleave_0, values = (var_502, var_499))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_507_begin_0 = const()[name = tensor<string, []>("op_507_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_507_end_0 = const()[name = tensor<string, []>("op_507_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_507_end_mask_0 = const()[name = tensor<string, []>("op_507_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_507 = slice_by_index(begin = var_507_begin_0, end = var_507_end_0, end_mask = var_507_end_mask_0, x = x_9)[name = tensor<string, []>("op_507")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = window_15)[name = tensor<string, []>("op_510")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_454, var_451))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_459_begin_0 = const()[name = tensor<string, []>("op_459_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_459_end_0 = const()[name = tensor<string, []>("op_459_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_459_end_mask_0 = const()[name = tensor<string, []>("op_459_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_459 = slice_by_index(begin = var_459_begin_0, end = var_459_end_0, end_mask = var_459_end_mask_0, x = x_9)[name = tensor<string, []>("op_459")];
-            tensor<int32, [3]> var_462_begin_0 = const()[name = tensor<string, []>("op_462_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_462_end_0 = const()[name = tensor<string, []>("op_462_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_462_end_mask_0 = const()[name = tensor<string, []>("op_462_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_462 = slice_by_index(begin = var_462_begin_0, end = var_462_end_0, end_mask = var_462_end_mask_0, x = window_17)[name = tensor<string, []>("op_462")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_82, interleave = window_17_interleave_0, values = (var_510, var_507))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_515_begin_0 = const()[name = tensor<string, []>("op_515_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_515_end_0 = const()[name = tensor<string, []>("op_515_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_515_end_mask_0 = const()[name = tensor<string, []>("op_515_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_515 = slice_by_index(begin = var_515_begin_0, end = var_515_end_0, end_mask = var_515_end_mask_0, x = x_9)[name = tensor<string, []>("op_515")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = window_17)[name = tensor<string, []>("op_518")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_462, var_459))[name = tensor<string, []>("window_19")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_82, interleave = window_19_interleave_0, values = (var_518, var_515))[name = tensor<string, []>("window_19")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_63 = concat(axis = var_69, interleave = input_63_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_487_split_sizes_0 = const()[name = tensor<string, []>("op_487_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_487_axis_0 = const()[name = tensor<string, []>("op_487_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_487_0, tensor<fp32, [4, 256, 16]> var_487_1 = split(axis = var_487_axis_0, split_sizes = var_487_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_487")];
-            tensor<fp32, [4, 256, 16]> var_489 = sigmoid(x = var_487_1)[name = tensor<string, []>("op_489")];
-            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_487_0, y = var_489)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [4, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_543_split_sizes_0 = const()[name = tensor<string, []>("op_543_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_543_axis_0 = const()[name = tensor<string, []>("op_543_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_543_0, tensor<fp32, [4, 256, 16]> var_543_1 = split(axis = var_543_axis_0, split_sizes = var_543_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_543")];
+            tensor<fp32, [4, 256, 16]> var_545 = sigmoid(x = var_543_1)[name = tensor<string, []>("op_545")];
+            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_543_0, y = var_545)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [4, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [4, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_520_begin_0 = const()[name = tensor<string, []>("op_520_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_520_end_0 = const()[name = tensor<string, []>("op_520_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_520_end_mask_0 = const()[name = tensor<string, []>("op_520_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [4, 1, 256]> var_520 = slice_by_index(begin = var_520_begin_0, end = var_520_end_0, end_mask = var_520_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_520")];
-            tensor<int32, [3]> var_522_perm_0 = const()[name = tensor<string, []>("op_522_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_522 = transpose(perm = var_522_perm_0, x = var_520)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 4, 256]> input_71 = add(x = x_9, y = var_522)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 4, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 4, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_545 = const()[name = tensor<string, []>("op_545"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_546 = mul(x = input_79, y = var_545)[name = tensor<string, []>("op_546")];
-            tensor<fp32, [1, 4, 256]> input_81 = add(x = var_546, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_576_begin_0 = const()[name = tensor<string, []>("op_576_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_576_end_0 = const()[name = tensor<string, []>("op_576_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_576_end_mask_0 = const()[name = tensor<string, []>("op_576_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [4, 1, 256]> var_576 = slice_by_index(begin = var_576_begin_0, end = var_576_end_0, end_mask = var_576_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_576")];
+            tensor<int32, [3]> var_578_perm_0 = const()[name = tensor<string, []>("op_578_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_578 = transpose(perm = var_578_perm_0, x = var_576)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 4, 256]> input_73 = add(x = x_9, y = var_578)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 4, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 4, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_602 = mul(x = input_81, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 256]> input_83 = add(x = var_602, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 4, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 4, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_576 = mul(x = input_91, y = var_575)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 256]> input_93 = add(x = var_576, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 4, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 4, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_632 = mul(x = input_93, y = var_631)[name = tensor<string, []>("op_632")];
+            tensor<fp32, [1, 4, 256]> input_95 = add(x = var_632, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -593,173 +615,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 4, 256]> var_590 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
+            tensor<fp32, [1, 4, 256]> var_646 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_647 = const()[name = tensor<string, []>("op_647"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_648 = reshape(shape = var_647, x = var_646)[name = tensor<string, []>("op_648")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_596 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_597 = const()[name = tensor<string, []>("op_597"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_598 = mul(x = var_596, y = var_597)[name = tensor<string, []>("op_598")];
-            tensor<int32, [4]> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_600 = reshape(shape = var_599, x = var_598)[name = tensor<string, []>("op_600")];
+            tensor<fp32, [1, 4, 256]> var_652 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_654 = mul(x = var_652, y = var_653)[name = tensor<string, []>("op_654")];
+            tensor<int32, [4]> var_655 = const()[name = tensor<string, []>("op_655"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_656 = reshape(shape = var_655, x = var_654)[name = tensor<string, []>("op_656")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_604 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_606 = reshape(shape = var_605, x = var_604)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 4, 256]> var_660 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_662 = reshape(shape = var_661, x = var_660)[name = tensor<string, []>("op_662")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 4, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [4]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_600)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_592)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_656)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_648)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 4, 4]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_617 = reshape(shape = var_616, x = sqrt_s_t_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [4, 4]> M_5 = real_div(x = encoder__causal_mask, y = var_617)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 4, 4]> var_619 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_619")];
+            tensor<int32, [2]> var_672 = const()[name = tensor<string, []>("op_672"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_673 = reshape(shape = var_672, x = sqrt_s_t_5)[name = tensor<string, []>("op_673")];
+            tensor<fp32, [4, 4]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_673)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 4, 4]> var_675 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_675")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_606)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_619, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_621_transpose_x_0 = const()[name = tensor<string, []>("op_621_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_621_transpose_y_0 = const()[name = tensor<string, []>("op_621_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_621 = matmul(transpose_x = var_621_transpose_x_0, transpose_y = var_621_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_621")];
-            tensor<fp32, [4]> var_622 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_622")];
-            tensor<int32, [4]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_624 = reshape(shape = var_623, x = var_622)[name = tensor<string, []>("op_624")];
-            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_621, y = var_624)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_662)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_675, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_677_transpose_x_0 = const()[name = tensor<string, []>("op_677_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_677_transpose_y_0 = const()[name = tensor<string, []>("op_677_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_677 = matmul(transpose_x = var_677_transpose_x_0, transpose_y = var_677_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [4]> var_678 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_678")];
+            tensor<int32, [4]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_680 = reshape(shape = var_679, x = var_678)[name = tensor<string, []>("op_680")];
+            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_677, y = var_680)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 4, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_627 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_627")];
-            tensor<bool, []> var_629_transpose_x_1 = const()[name = tensor<string, []>("op_629_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_629_transpose_y_1 = const()[name = tensor<string, []>("op_629_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_629 = matmul(transpose_x = var_629_transpose_x_1, transpose_y = var_629_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_627, y = var_629)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_631)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_633 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_633")];
-            tensor<fp32, [1, 4, 64, 64]> var_634 = real_div(x = new_kv_unnorm_5, y = var_633)[name = tensor<string, []>("op_634")];
-            tensor<int32, [4]> var_635_perm_0 = const()[name = tensor<string, []>("op_635_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_683 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_683")];
+            tensor<bool, []> var_685_transpose_x_1 = const()[name = tensor<string, []>("op_685_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_685_transpose_y_1 = const()[name = tensor<string, []>("op_685_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_685 = matmul(transpose_x = var_685_transpose_x_1, transpose_y = var_685_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_683, y = var_685)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_687)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_689 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [1, 4, 64, 64]> var_690 = real_div(x = new_kv_unnorm_5, y = var_689)[name = tensor<string, []>("op_690")];
+            tensor<int32, [4]> var_691_perm_0 = const()[name = tensor<string, []>("op_691_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_635 = transpose(perm = var_635_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_635)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_639 = const()[name = tensor<string, []>("op_639"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_639, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 4, 256]> var_641 = silu(x = input_97)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 256]> input_99 = mul(x = var_641, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 4, 4, 64]> var_691 = transpose(perm = var_691_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_74, x = var_691)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_695 = const()[name = tensor<string, []>("op_695"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_695, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 4, 256]> var_697 = silu(x = input_99)[name = tensor<string, []>("op_697")];
+            tensor<fp32, [1, 4, 256]> input_101 = mul(x = var_697, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_21_begin_0 = const()[name = tensor<string, []>("window_21_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_21_end_0 = const()[name = tensor<string, []>("window_21_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_21_end_mask_0 = const()[name = tensor<string, []>("window_21_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_21_squeeze_mask_0 = const()[name = tensor<string, []>("window_21_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_21 = slice_by_index(begin = window_21_begin_0, end = window_21_end_0, end_mask = window_21_end_mask_0, squeeze_mask = window_21_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<int32, [3]> var_705_begin_0 = const()[name = tensor<string, []>("op_705_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_705_end_0 = const()[name = tensor<string, []>("op_705_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_705_end_mask_0 = const()[name = tensor<string, []>("op_705_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_705 = slice_by_index(begin = var_705_begin_0, end = var_705_end_0, end_mask = var_705_end_mask_0, x = x_15)[name = tensor<string, []>("op_705")];
+            tensor<int32, [3]> var_708_begin_0 = const()[name = tensor<string, []>("op_708_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_708_end_0 = const()[name = tensor<string, []>("op_708_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_708_end_mask_0 = const()[name = tensor<string, []>("op_708_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_708 = slice_by_index(begin = var_708_begin_0, end = var_708_end_0, end_mask = var_708_end_mask_0, x = window_21)[name = tensor<string, []>("op_708")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<int32, [3]> var_657_begin_0 = const()[name = tensor<string, []>("op_657_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_657_end_0 = const()[name = tensor<string, []>("op_657_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_657_end_mask_0 = const()[name = tensor<string, []>("op_657_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_657 = slice_by_index(begin = var_657_begin_0, end = var_657_end_0, end_mask = var_657_end_mask_0, x = x_15)[name = tensor<string, []>("op_657")];
-            tensor<int32, [3]> var_660_begin_0 = const()[name = tensor<string, []>("op_660_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_660_end_0 = const()[name = tensor<string, []>("op_660_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_660_end_mask_0 = const()[name = tensor<string, []>("op_660_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_660 = slice_by_index(begin = var_660_begin_0, end = var_660_end_0, end_mask = var_660_end_mask_0, x = window_23)[name = tensor<string, []>("op_660")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_82, interleave = window_23_interleave_0, values = (var_708, var_705))[name = tensor<string, []>("window_23")];
+            tensor<int32, [3]> var_713_begin_0 = const()[name = tensor<string, []>("op_713_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_713_end_0 = const()[name = tensor<string, []>("op_713_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_713_end_mask_0 = const()[name = tensor<string, []>("op_713_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_713 = slice_by_index(begin = var_713_begin_0, end = var_713_end_0, end_mask = var_713_end_mask_0, x = x_15)[name = tensor<string, []>("op_713")];
+            tensor<int32, [3]> var_716_begin_0 = const()[name = tensor<string, []>("op_716_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_716_end_0 = const()[name = tensor<string, []>("op_716_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_716_end_mask_0 = const()[name = tensor<string, []>("op_716_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_716 = slice_by_index(begin = var_716_begin_0, end = var_716_end_0, end_mask = var_716_end_mask_0, x = window_23)[name = tensor<string, []>("op_716")];
             tensor<bool, []> window_25_interleave_0 = const()[name = tensor<string, []>("window_25_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_26, interleave = window_25_interleave_0, values = (var_660, var_657))[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_665_begin_0 = const()[name = tensor<string, []>("op_665_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_665_end_0 = const()[name = tensor<string, []>("op_665_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_665_end_mask_0 = const()[name = tensor<string, []>("op_665_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_665 = slice_by_index(begin = var_665_begin_0, end = var_665_end_0, end_mask = var_665_end_mask_0, x = x_15)[name = tensor<string, []>("op_665")];
-            tensor<int32, [3]> var_668_begin_0 = const()[name = tensor<string, []>("op_668_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_668_end_0 = const()[name = tensor<string, []>("op_668_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_668_end_mask_0 = const()[name = tensor<string, []>("op_668_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_668 = slice_by_index(begin = var_668_begin_0, end = var_668_end_0, end_mask = var_668_end_mask_0, x = window_25)[name = tensor<string, []>("op_668")];
+            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_82, interleave = window_25_interleave_0, values = (var_716, var_713))[name = tensor<string, []>("window_25")];
+            tensor<int32, [3]> var_721_begin_0 = const()[name = tensor<string, []>("op_721_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_721_end_0 = const()[name = tensor<string, []>("op_721_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_721_end_mask_0 = const()[name = tensor<string, []>("op_721_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_721 = slice_by_index(begin = var_721_begin_0, end = var_721_end_0, end_mask = var_721_end_mask_0, x = x_15)[name = tensor<string, []>("op_721")];
+            tensor<int32, [3]> var_724_begin_0 = const()[name = tensor<string, []>("op_724_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_724_end_0 = const()[name = tensor<string, []>("op_724_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_724_end_mask_0 = const()[name = tensor<string, []>("op_724_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_724 = slice_by_index(begin = var_724_begin_0, end = var_724_end_0, end_mask = var_724_end_mask_0, x = window_25)[name = tensor<string, []>("op_724")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_668, var_665))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_673_begin_0 = const()[name = tensor<string, []>("op_673_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_673_end_0 = const()[name = tensor<string, []>("op_673_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_673_end_mask_0 = const()[name = tensor<string, []>("op_673_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_673 = slice_by_index(begin = var_673_begin_0, end = var_673_end_0, end_mask = var_673_end_mask_0, x = x_15)[name = tensor<string, []>("op_673")];
-            tensor<int32, [3]> var_676_begin_0 = const()[name = tensor<string, []>("op_676_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_676_end_0 = const()[name = tensor<string, []>("op_676_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_676_end_mask_0 = const()[name = tensor<string, []>("op_676_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_676 = slice_by_index(begin = var_676_begin_0, end = var_676_end_0, end_mask = var_676_end_mask_0, x = window_27)[name = tensor<string, []>("op_676")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_82, interleave = window_27_interleave_0, values = (var_724, var_721))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_729_begin_0 = const()[name = tensor<string, []>("op_729_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_729_end_0 = const()[name = tensor<string, []>("op_729_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_729_end_mask_0 = const()[name = tensor<string, []>("op_729_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_729 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = x_15)[name = tensor<string, []>("op_729")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = window_27)[name = tensor<string, []>("op_732")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_676, var_673))[name = tensor<string, []>("window_29")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_82, interleave = window_29_interleave_0, values = (var_732, var_729))[name = tensor<string, []>("window_29")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_103 = concat(axis = var_69, interleave = input_103_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_701_split_sizes_0 = const()[name = tensor<string, []>("op_701_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_701_axis_0 = const()[name = tensor<string, []>("op_701_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_701_0, tensor<fp32, [4, 256, 16]> var_701_1 = split(axis = var_701_axis_0, split_sizes = var_701_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_701")];
-            tensor<fp32, [4, 256, 16]> var_703 = sigmoid(x = var_701_1)[name = tensor<string, []>("op_703")];
-            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_701_0, y = var_703)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [4, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_757_split_sizes_0 = const()[name = tensor<string, []>("op_757_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_757_axis_0 = const()[name = tensor<string, []>("op_757_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_757_0, tensor<fp32, [4, 256, 16]> var_757_1 = split(axis = var_757_axis_0, split_sizes = var_757_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_757")];
+            tensor<fp32, [4, 256, 16]> var_759 = sigmoid(x = var_757_1)[name = tensor<string, []>("op_759")];
+            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_757_0, y = var_759)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [4, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [4, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_734_begin_0 = const()[name = tensor<string, []>("op_734_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_734_end_0 = const()[name = tensor<string, []>("op_734_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_734_end_mask_0 = const()[name = tensor<string, []>("op_734_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [4, 1, 256]> var_734 = slice_by_index(begin = var_734_begin_0, end = var_734_end_0, end_mask = var_734_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_734")];
-            tensor<int32, [3]> var_736_perm_0 = const()[name = tensor<string, []>("op_736_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_736 = transpose(perm = var_736_perm_0, x = var_734)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 4, 256]> input_111 = add(x = x_15, y = var_736)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 4, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 4, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_760 = mul(x = input_119, y = var_759)[name = tensor<string, []>("op_760")];
-            tensor<fp32, [1, 4, 256]> input_121 = add(x = var_760, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_790_begin_0 = const()[name = tensor<string, []>("op_790_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_790_end_0 = const()[name = tensor<string, []>("op_790_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_790_end_mask_0 = const()[name = tensor<string, []>("op_790_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [4, 1, 256]> var_790 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_790")];
+            tensor<int32, [3]> var_792_perm_0 = const()[name = tensor<string, []>("op_792_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_792 = transpose(perm = var_792_perm_0, x = var_790)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 4, 256]> input_113 = add(x = x_15, y = var_792)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 4, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 4, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_815 = const()[name = tensor<string, []>("op_815"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_816 = mul(x = input_121, y = var_815)[name = tensor<string, []>("op_816")];
+            tensor<fp32, [1, 4, 256]> input_123 = add(x = var_816, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 4, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 4, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_790 = mul(x = input_131, y = var_789)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 256]> input_133 = add(x = var_790, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 4, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 4, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_846 = mul(x = input_133, y = var_845)[name = tensor<string, []>("op_846")];
+            tensor<fp32, [1, 4, 256]> input_135 = add(x = var_846, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -770,209 +792,202 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 4, 256]> var_804 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_806 = reshape(shape = var_805, x = var_804)[name = tensor<string, []>("op_806")];
+            tensor<fp32, [1, 4, 256]> var_860 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_861 = const()[name = tensor<string, []>("op_861"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_862 = reshape(shape = var_861, x = var_860)[name = tensor<string, []>("op_862")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_810 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_812 = mul(x = var_810, y = var_811)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
+            tensor<fp32, [1, 4, 256]> var_866 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_868 = mul(x = var_866, y = var_867)[name = tensor<string, []>("op_868")];
+            tensor<int32, [4]> var_869 = const()[name = tensor<string, []>("op_869"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_870 = reshape(shape = var_869, x = var_868)[name = tensor<string, []>("op_870")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_818 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_819 = const()[name = tensor<string, []>("op_819"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_820 = reshape(shape = var_819, x = var_818)[name = tensor<string, []>("op_820")];
+            tensor<fp32, [1, 4, 256]> var_874 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_876 = reshape(shape = var_875, x = var_874)[name = tensor<string, []>("op_876")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 4, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [4]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_814)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_806)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_870)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_862)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 4, 4]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_831 = reshape(shape = var_830, x = sqrt_s_t_7)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [4, 4]> M_7 = real_div(x = encoder__causal_mask, y = var_831)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 4, 4]> var_833 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_833")];
+            tensor<int32, [2]> var_886 = const()[name = tensor<string, []>("op_886"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_887 = reshape(shape = var_886, x = sqrt_s_t_7)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [4, 4]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_887)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 4, 4]> var_889 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_889")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_820)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_833, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_835_transpose_x_0 = const()[name = tensor<string, []>("op_835_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_835_transpose_y_0 = const()[name = tensor<string, []>("op_835_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_835 = matmul(transpose_x = var_835_transpose_x_0, transpose_y = var_835_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_835")];
-            tensor<fp32, [4]> var_836 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_836")];
-            tensor<int32, [4]> var_837 = const()[name = tensor<string, []>("op_837"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_838 = reshape(shape = var_837, x = var_836)[name = tensor<string, []>("op_838")];
-            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_835, y = var_838)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_876)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_889, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_891_transpose_x_0 = const()[name = tensor<string, []>("op_891_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_891_transpose_y_0 = const()[name = tensor<string, []>("op_891_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_891 = matmul(transpose_x = var_891_transpose_x_0, transpose_y = var_891_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_891")];
+            tensor<fp32, [4]> var_892 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_892")];
+            tensor<int32, [4]> var_893 = const()[name = tensor<string, []>("op_893"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_894 = reshape(shape = var_893, x = var_892)[name = tensor<string, []>("op_894")];
+            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_891, y = var_894)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 4, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_841 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_841")];
-            tensor<bool, []> var_843_transpose_x_1 = const()[name = tensor<string, []>("op_843_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_843_transpose_y_1 = const()[name = tensor<string, []>("op_843_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_843 = matmul(transpose_x = var_843_transpose_x_1, transpose_y = var_843_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_843")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_841, y = var_843)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_845)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_847 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_847")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_847)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_849_perm_0 = const()[name = tensor<string, []>("op_849_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_897 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_897")];
+            tensor<bool, []> var_899_transpose_x_1 = const()[name = tensor<string, []>("op_899_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_899_transpose_y_1 = const()[name = tensor<string, []>("op_899_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_899 = matmul(transpose_x = var_899_transpose_x_1, transpose_y = var_899_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_897, y = var_899)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_901 = const()[name = tensor<string, []>("op_901"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_901)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_903 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_903")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_903)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_905_perm_0 = const()[name = tensor<string, []>("op_905_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_849 = transpose(perm = var_849_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_849)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_853 = const()[name = tensor<string, []>("op_853"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_853, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 4, 256]> var_855 = silu(x = input_137)[name = tensor<string, []>("op_855")];
-            tensor<fp32, [1, 4, 256]> input_139 = mul(x = var_855, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 4, 4, 64]> var_905 = transpose(perm = var_905_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_74, x = var_905)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_909, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 4, 256]> var_911 = silu(x = input_139)[name = tensor<string, []>("op_911")];
+            tensor<fp32, [1, 4, 256]> input_141 = mul(x = var_911, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_31_begin_0 = const()[name = tensor<string, []>("window_31_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_31_end_0 = const()[name = tensor<string, []>("window_31_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_31_end_mask_0 = const()[name = tensor<string, []>("window_31_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_31_squeeze_mask_0 = const()[name = tensor<string, []>("window_31_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_31 = slice_by_index(begin = window_31_begin_0, end = window_31_end_0, end_mask = window_31_end_mask_0, squeeze_mask = window_31_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_863_begin_0 = const()[name = tensor<string, []>("op_863_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_863_end_0 = const()[name = tensor<string, []>("op_863_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_863_end_mask_0 = const()[name = tensor<string, []>("op_863_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_863 = slice_by_index(begin = var_863_begin_0, end = var_863_end_0, end_mask = var_863_end_mask_0, x = x_21)[name = tensor<string, []>("op_863")];
-            tensor<int32, [3]> var_866_begin_0 = const()[name = tensor<string, []>("op_866_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_866_end_0 = const()[name = tensor<string, []>("op_866_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_866_end_mask_0 = const()[name = tensor<string, []>("op_866_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_866 = slice_by_index(begin = var_866_begin_0, end = var_866_end_0, end_mask = var_866_end_mask_0, x = window_31)[name = tensor<string, []>("op_866")];
+            tensor<int32, [3]> var_919_begin_0 = const()[name = tensor<string, []>("op_919_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_919_end_0 = const()[name = tensor<string, []>("op_919_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_919_end_mask_0 = const()[name = tensor<string, []>("op_919_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_919 = slice_by_index(begin = var_919_begin_0, end = var_919_end_0, end_mask = var_919_end_mask_0, x = x_21)[name = tensor<string, []>("op_919")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_922 = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = window_31)[name = tensor<string, []>("op_922")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_26, interleave = window_33_interleave_0, values = (var_866, var_863))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_871 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = x_21)[name = tensor<string, []>("op_871")];
-            tensor<int32, [3]> var_874_begin_0 = const()[name = tensor<string, []>("op_874_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_874_end_0 = const()[name = tensor<string, []>("op_874_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_874_end_mask_0 = const()[name = tensor<string, []>("op_874_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_874 = slice_by_index(begin = var_874_begin_0, end = var_874_end_0, end_mask = var_874_end_mask_0, x = window_33)[name = tensor<string, []>("op_874")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_82, interleave = window_33_interleave_0, values = (var_922, var_919))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_927_begin_0 = const()[name = tensor<string, []>("op_927_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_927_end_0 = const()[name = tensor<string, []>("op_927_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_927_end_mask_0 = const()[name = tensor<string, []>("op_927_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_927 = slice_by_index(begin = var_927_begin_0, end = var_927_end_0, end_mask = var_927_end_mask_0, x = x_21)[name = tensor<string, []>("op_927")];
+            tensor<int32, [3]> var_930_begin_0 = const()[name = tensor<string, []>("op_930_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_930_end_0 = const()[name = tensor<string, []>("op_930_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_930_end_mask_0 = const()[name = tensor<string, []>("op_930_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_930 = slice_by_index(begin = var_930_begin_0, end = var_930_end_0, end_mask = var_930_end_mask_0, x = window_33)[name = tensor<string, []>("op_930")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_26, interleave = window_35_interleave_0, values = (var_874, var_871))[name = tensor<string, []>("window_35")];
-            tensor<int32, [3]> var_879_begin_0 = const()[name = tensor<string, []>("op_879_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_879_end_0 = const()[name = tensor<string, []>("op_879_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_879_end_mask_0 = const()[name = tensor<string, []>("op_879_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_879 = slice_by_index(begin = var_879_begin_0, end = var_879_end_0, end_mask = var_879_end_mask_0, x = x_21)[name = tensor<string, []>("op_879")];
-            tensor<int32, [3]> var_882_begin_0 = const()[name = tensor<string, []>("op_882_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_882_end_0 = const()[name = tensor<string, []>("op_882_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_882_end_mask_0 = const()[name = tensor<string, []>("op_882_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_882 = slice_by_index(begin = var_882_begin_0, end = var_882_end_0, end_mask = var_882_end_mask_0, x = window_35)[name = tensor<string, []>("op_882")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_82, interleave = window_35_interleave_0, values = (var_930, var_927))[name = tensor<string, []>("window_35")];
+            tensor<int32, [3]> var_935_begin_0 = const()[name = tensor<string, []>("op_935_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_935_end_0 = const()[name = tensor<string, []>("op_935_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_935_end_mask_0 = const()[name = tensor<string, []>("op_935_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_935 = slice_by_index(begin = var_935_begin_0, end = var_935_end_0, end_mask = var_935_end_mask_0, x = x_21)[name = tensor<string, []>("op_935")];
+            tensor<int32, [3]> var_938_begin_0 = const()[name = tensor<string, []>("op_938_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_938_end_0 = const()[name = tensor<string, []>("op_938_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_938_end_mask_0 = const()[name = tensor<string, []>("op_938_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_938 = slice_by_index(begin = var_938_begin_0, end = var_938_end_0, end_mask = var_938_end_mask_0, x = window_35)[name = tensor<string, []>("op_938")];
             tensor<bool, []> window_37_interleave_0 = const()[name = tensor<string, []>("window_37_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_26, interleave = window_37_interleave_0, values = (var_882, var_879))[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_887_begin_0 = const()[name = tensor<string, []>("op_887_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_887_end_0 = const()[name = tensor<string, []>("op_887_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_887_end_mask_0 = const()[name = tensor<string, []>("op_887_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_887 = slice_by_index(begin = var_887_begin_0, end = var_887_end_0, end_mask = var_887_end_mask_0, x = x_21)[name = tensor<string, []>("op_887")];
-            tensor<int32, [3]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_890 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = window_37)[name = tensor<string, []>("op_890")];
+            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_82, interleave = window_37_interleave_0, values = (var_938, var_935))[name = tensor<string, []>("window_37")];
+            tensor<int32, [3]> var_943_begin_0 = const()[name = tensor<string, []>("op_943_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_943_end_0 = const()[name = tensor<string, []>("op_943_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_943_end_mask_0 = const()[name = tensor<string, []>("op_943_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_943 = slice_by_index(begin = var_943_begin_0, end = var_943_end_0, end_mask = var_943_end_mask_0, x = x_21)[name = tensor<string, []>("op_943")];
+            tensor<int32, [3]> var_946_begin_0 = const()[name = tensor<string, []>("op_946_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_946_end_0 = const()[name = tensor<string, []>("op_946_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_946_end_mask_0 = const()[name = tensor<string, []>("op_946_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_946 = slice_by_index(begin = var_946_begin_0, end = var_946_end_0, end_mask = var_946_end_mask_0, x = window_37)[name = tensor<string, []>("op_946")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_890, var_887))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_82, interleave = window_interleave_0, values = (var_946, var_943))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_143 = concat(axis = var_69, interleave = input_143_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_915_split_sizes_0 = const()[name = tensor<string, []>("op_915_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_915_axis_0 = const()[name = tensor<string, []>("op_915_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_915_0, tensor<fp32, [4, 256, 16]> var_915_1 = split(axis = var_915_axis_0, split_sizes = var_915_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_915")];
-            tensor<fp32, [4, 256, 16]> var_917 = sigmoid(x = var_915_1)[name = tensor<string, []>("op_917")];
-            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_915_0, y = var_917)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [4, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_971_split_sizes_0 = const()[name = tensor<string, []>("op_971_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_971_0, tensor<fp32, [4, 256, 16]> var_971_1 = split(axis = var_971_axis_0, split_sizes = var_971_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_971")];
+            tensor<fp32, [4, 256, 16]> var_973 = sigmoid(x = var_971_1)[name = tensor<string, []>("op_973")];
+            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_971_0, y = var_973)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [4, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [4, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [4, 1, 256]> var_948 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_948")];
-            tensor<int32, [3]> var_950_perm_0 = const()[name = tensor<string, []>("op_950_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_950 = transpose(perm = var_950_perm_0, x = var_948)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 4, 256]> input_151 = add(x = x_21, y = var_950)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 4, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 4, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_973 = const()[name = tensor<string, []>("op_973"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_974 = mul(x = input_159, y = var_973)[name = tensor<string, []>("op_974")];
-            tensor<fp32, [1, 4, 256]> input_161 = add(x = var_974, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [4, 1, 256]> var_1004 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1004")];
+            tensor<int32, [3]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = var_1004)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 4, 256]> input_153 = add(x = x_21, y = var_1006)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 4, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 4, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1029 = const()[name = tensor<string, []>("op_1029"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_1030 = mul(x = input_161, y = var_1029)[name = tensor<string, []>("op_1030")];
+            tensor<fp32, [1, 4, 256]> input_163 = add(x = var_1030, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 4]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_71, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
-            tensor<int32, [3]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
-            tensor<bool, [3]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = cat)[name = tensor<string, []>("op_992")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 4, 1]> var_995 = reduce_l2_norm(axes = var_994, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_995")];
+            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1048_begin_0 = const()[name = tensor<string, []>("op_1048_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
+            tensor<int32, [3]> var_1048_end_0 = const()[name = tensor<string, []>("op_1048_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
+            tensor<bool, [3]> var_1048_end_mask_0 = const()[name = tensor<string, []>("op_1048_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1048_begin_0, end = var_1048_end_0, end_mask = var_1048_end_mask_0, x = cat)[name = tensor<string, []>("op_1048")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 4, 1]> var_1051 = reduce_l2_norm(axes = var_1050, keep_dims = var_65, x = input_165)[name = tensor<string, []>("op_1051")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_995)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_999_axis_0 = const()[name = tensor<string, []>("op_999_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_999_axis_0, values = (var_206, var_420, var_634, nkv_1))[name = tensor<string, []>("op_999")];
-            tensor<int32, []> var_1001_axis_0 = const()[name = tensor<string, []>("op_1001_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1001_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1001")];
-            tensor<int32, []> var_1003_axis_0 = const()[name = tensor<string, []>("op_1003_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1003_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1003")];
-            tensor<fp32, []> var_1012 = const()[name = tensor<string, []>("op_1012"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1017 = const()[name = tensor<string, []>("op_1017"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1020 = const()[name = tensor<string, []>("op_1020"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1026 = const()[name = tensor<string, []>("op_1026"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1032 = const()[name = tensor<string, []>("op_1032"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_79, beta = const_12, x = var_1051)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1055_axis_0 = const()[name = tensor<string, []>("op_1055_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1055_axis_0, values = (var_262, var_476, var_690, nkv_1))[name = tensor<string, []>("op_1055")];
+            tensor<int32, []> var_1057_axis_0 = const()[name = tensor<string, []>("op_1057_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1057_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1057")];
+            tensor<int32, []> var_1059_axis_0 = const()[name = tensor<string, []>("op_1059_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1059_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1059")];
             tensor<fp32, [1, 4, 9, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 4, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395584)))];
-            tensor<int32, [1]> var_1094_axes_0 = const()[name = tensor<string, []>("op_1094_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 4, 1, 256]> var_1094 = expand_dims(axes = var_1094_axes_0, x = emb)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 4, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 9, 1])];
-            tensor<fp32, [1, 4, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1094)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 9, 512]> input_165 = concat(axis = var_1026, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 4, 9, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1102_perm_0 = const()[name = tensor<string, []>("op_1102_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [1, 9, 4, 256]> var_1102 = transpose(perm = var_1102_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [9, 4, 256]> x_29 = reshape(shape = var_1106, x = var_1102)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 4, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 9, 512]> input_167 = concat(axis = var_72, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 4, 9, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [1, 9, 4, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [9, 4, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -983,132 +998,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [9, 4, 256]> var_1114 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1115 = const()[name = tensor<string, []>("op_1115"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1116 = reshape(shape = var_1115, x = var_1114)[name = tensor<string, []>("op_1116")];
+            tensor<fp32, [9, 4, 256]> var_1147 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> var_1120 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1121 = const()[name = tensor<string, []>("op_1121"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 4, 256]> var_1122 = mul(x = var_1120, y = var_1121)[name = tensor<string, []>("op_1122")];
-            tensor<int32, [4]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1124 = reshape(shape = var_1123, x = var_1122)[name = tensor<string, []>("op_1124")];
+            tensor<fp32, [9, 4, 256]> var_1153 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 4, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> var_1128 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1129 = const()[name = tensor<string, []>("op_1129"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1130 = reshape(shape = var_1129, x = var_1128)[name = tensor<string, []>("op_1130")];
+            tensor<fp32, [9, 4, 256]> var_1161 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [9, 4, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_1032, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_69, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [4]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_1 = clip(alpha = var_1022, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [4]> clip_1 = clip(alpha = var_59, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [4]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1124)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [9, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1116)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [9, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [9, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [9, 4, 4, 4]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [2]>([1, 4])];
-            tensor<fp32, [1, 4]> var_1143 = reshape(shape = var_1142, x = valid_mask)[name = tensor<string, []>("op_1143")];
-            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1143)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1146 = reshape(shape = var_1145, x = sqrt_s_t_9)[name = tensor<string, []>("op_1146")];
-            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1146)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [9, 4, 4, 4]> var_1148 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1148")];
+            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 4])];
+            tensor<fp32, [1, 4]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
+            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
+            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [9, 4, 4, 4]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1130)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [9, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1148, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1150_transpose_x_0 = const()[name = tensor<string, []>("op_1150_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1150_transpose_y_0 = const()[name = tensor<string, []>("op_1150_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 4, 64]> var_1150 = matmul(transpose_x = var_1150_transpose_x_0, transpose_y = var_1150_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1150")];
-            tensor<fp32, [4]> var_1151 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1151")];
-            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1153 = reshape(shape = var_1152, x = var_1151)[name = tensor<string, []>("op_1153")];
-            tensor<fp32, [9, 4, 4, 64]> cross_9 = mul(x = var_1150, y = var_1153)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [9, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [9, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 4, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
+            tensor<fp32, [4]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
+            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
+            tensor<fp32, [9, 4, 4, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [9, 4, 4, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1157 = reshape(shape = var_1156, x = valid_mask)[name = tensor<string, []>("op_1157")];
-            tensor<fp32, [9, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1157)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1159 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1159")];
-            tensor<bool, []> var_1161_transpose_x_1 = const()[name = tensor<string, []>("op_1161_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1161_transpose_y_1 = const()[name = tensor<string, []>("op_1161_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1161 = matmul(transpose_x = var_1161_transpose_x_1, transpose_y = var_1161_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1161")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1159, y = var_1161)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1163_keep_dims_0 = const()[name = tensor<string, []>("op_1163_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1163 = reduce_sum(keep_dims = var_1163_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1163")];
-            tensor<int32, [1]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1165 = reshape(shape = var_1164, x = var_1163)[name = tensor<string, []>("op_1165")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1165)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
+            tensor<fp32, [9, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [9, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
+            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
+            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1022, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_59, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1169 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1169")];
-            tensor<int32, [4]> var_1170_perm_0 = const()[name = tensor<string, []>("op_1170_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [9, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
+            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 4, 4, 64]> var_1170 = transpose(perm = var_1170_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [9, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1019, x = var_1170)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1174 = const()[name = tensor<string, []>("op_1174"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [9, 4, 256]> out_29 = reshape(shape = var_1174, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [9, 4, 256]> var_1176 = silu(x = input_169)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [9, 4, 256]> input_171 = mul(x = var_1176, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [9, 4, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [9, 4, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 4, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [9, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_74, x = var_1203)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [9, 4, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [9, 4, 256]> var_1209 = silu(x = input_171)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [9, 4, 256]> input_173 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 4, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [9, 4, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1017, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1186 = const()[name = tensor<string, []>("op_1186"), val = tensor<int32, [4]>([1, 9, 4, 256])];
-            tensor<fp32, [1, 9, 4, 256]> var_1187 = reshape(shape = var_1186, x = xt_1)[name = tensor<string, []>("op_1187")];
-            tensor<int32, [4]> var_1188_perm_0 = const()[name = tensor<string, []>("op_1188_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([4, 9, 256])];
-            tensor<fp32, [1, 4, 9, 256]> var_1188 = transpose(perm = var_1188_perm_0, x = var_1187)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [4, 9, 256]> query_1 = reshape(shape = var_1191, x = var_1188)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [9, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_66, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 9, 4, 256])];
+            tensor<fp32, [1, 9, 4, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
+            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([4, 9, 256])];
+            tensor<fp32, [1, 4, 9, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [4, 9, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 4, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [9, 4, 768]> var_1214 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [9, 4, 768]> var_1247 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([9, 4, 3, 256])];
-            tensor<fp32, [9, 4, 3, 256]> var_1216 = reshape(shape = concat_1, x = var_1214)[name = tensor<string, []>("op_1216")];
-            tensor<int32, [1]> var_1217_axes_0 = const()[name = tensor<string, []>("op_1217_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 4, 3, 256]> var_1217 = expand_dims(axes = var_1217_axes_0, x = var_1216)[name = tensor<string, []>("op_1217")];
-            tensor<int32, [5]> var_1218_perm_0 = const()[name = tensor<string, []>("op_1218_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1219_axes_0 = const()[name = tensor<string, []>("op_1219_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 4, 1, 256]> var_1218 = transpose(perm = var_1218_perm_0, x = var_1217)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 9, 4, 256]> var_1219 = squeeze(axes = var_1219_axes_0, x = var_1218)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [9, 4, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
+            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 4, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
+            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 4, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 9, 4, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 9, 4, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [9, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 9, 4, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [9, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 9, 4, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1227 = const()[name = tensor<string, []>("op_1227"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1228 = reshape(shape = var_1227, x = q_11)[name = tensor<string, []>("op_1228")];
+            tensor<fp32, [9, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1234 = const()[name = tensor<string, []>("op_1234"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1235 = reshape(shape = var_1234, x = k_11)[name = tensor<string, []>("op_1235")];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1241 = const()[name = tensor<string, []>("op_1241"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1242 = reshape(shape = var_1241, x = v_11)[name = tensor<string, []>("op_1242")];
+            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1228)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [4, 4, 9, 64]> q_15 = reshape(shape = var_1245, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1247 = const()[name = tensor<string, []>("op_1247"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1235)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [4, 4, 9, 64]> k_15 = reshape(shape = var_1247, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1242)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [4, 4, 9, 64]> v_15 = reshape(shape = var_1249, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [4, 4, 9, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [4, 4, 9, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [4, 4, 9, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 9, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1119,30 +1134,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 9, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1257 = const()[name = tensor<string, []>("op_1257"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [9, 4, 4, 64]> var_1253 = transpose(perm = var_1252, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1257, x = var_1253)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1261 = const()[name = tensor<string, []>("op_1261"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [9, 4, 256]> attn_output_7 = reshape(shape = var_1261, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [9, 4, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [9, 4, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 9, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [4, 9, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 9, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1017, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [4, 9, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [4, 9, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [4, 9, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [4, 9, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 9, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 9, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_66, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [4, 9, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [4, 9, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 9, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [4, 9, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1017, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1281 = const()[name = tensor<string, []>("op_1281"), val = tensor<int32, [4]>([1, 4, 9, 256])];
-            tensor<fp32, [1, 4, 9, 256]> x_31 = reshape(shape = var_1281, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1283_perm_0 = const()[name = tensor<string, []>("op_1283_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [1, 9, 4, 256]> var_1283 = transpose(perm = var_1283_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [9, 4, 256]> x = reshape(shape = var_1287, x = var_1283)[name = tensor<string, []>("x")];
+            tensor<fp32, [4, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_66, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 4, 9, 256])];
+            tensor<fp32, [1, 4, 9, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [1, 9, 4, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [9, 4, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1153,120 +1168,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [9, 4, 256]> var_1295 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1296 = const()[name = tensor<string, []>("op_1296"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1297 = reshape(shape = var_1296, x = var_1295)[name = tensor<string, []>("op_1297")];
+            tensor<fp32, [9, 4, 256]> var_1328 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> var_1301 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1302 = const()[name = tensor<string, []>("op_1302"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 4, 256]> var_1303 = mul(x = var_1301, y = var_1302)[name = tensor<string, []>("op_1303")];
-            tensor<int32, [4]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1305 = reshape(shape = var_1304, x = var_1303)[name = tensor<string, []>("op_1305")];
+            tensor<fp32, [9, 4, 256]> var_1334 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 4, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> var_1309 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [4]>([9, 4, 4, 64])];
-            tensor<fp32, [9, 4, 4, 64]> var_1311 = reshape(shape = var_1310, x = var_1309)[name = tensor<string, []>("op_1311")];
+            tensor<fp32, [9, 4, 256]> var_1342 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([9, 4, 4, 64])];
+            tensor<fp32, [9, 4, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 4, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [9, 4, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [4]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_3 = clip(alpha = var_1022, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [4]> clip_3 = clip(alpha = var_59, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [4]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1305)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [9, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1297)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [9, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [9, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [9, 4, 4, 4]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1327 = reshape(shape = var_1326, x = sqrt_s_t)[name = tensor<string, []>("op_1327")];
-            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1327)[name = tensor<string, []>("M")];
-            tensor<fp32, [9, 4, 4, 4]> var_1329 = mul(x = qk, y = M)[name = tensor<string, []>("op_1329")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1311)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [9, 4, 4, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1329, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1331_transpose_x_0 = const()[name = tensor<string, []>("op_1331_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1331_transpose_y_0 = const()[name = tensor<string, []>("op_1331_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 4, 64]> var_1331 = matmul(transpose_x = var_1331_transpose_x_0, transpose_y = var_1331_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1331")];
-            tensor<fp32, [4]> var_1332 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1332")];
-            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1334 = reshape(shape = var_1333, x = var_1332)[name = tensor<string, []>("op_1334")];
-            tensor<fp32, [9, 4, 4, 64]> cross = mul(x = var_1331, y = var_1334)[name = tensor<string, []>("cross")];
-            tensor<fp32, [9, 4, 4, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [9, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1157)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [9, 4, 64, 64]> var_1340 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1340")];
-            tensor<bool, []> var_1342_transpose_x_1 = const()[name = tensor<string, []>("op_1342_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1342_transpose_y_1 = const()[name = tensor<string, []>("op_1342_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1342 = matmul(transpose_x = var_1342_transpose_x_1, transpose_y = var_1342_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1342")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1340, y = var_1342)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1165)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
+            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
+            tensor<fp32, [9, 4, 4, 4]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [9, 4, 4, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 4, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
+            tensor<fp32, [4]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
+            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
+            tensor<fp32, [9, 4, 4, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
+            tensor<fp32, [9, 4, 4, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [9, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [9, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
+            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1022, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_59, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [9, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1351_perm_0 = const()[name = tensor<string, []>("op_1351_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 4, 4, 64]> var_1351 = transpose(perm = var_1351_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [9, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1019, x = var_1351)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1355 = const()[name = tensor<string, []>("op_1355"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [9, 4, 256]> out = reshape(shape = var_1355, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [9, 4, 256]> var_1357 = silu(x = input_187)[name = tensor<string, []>("op_1357")];
-            tensor<fp32, [9, 4, 256]> input_189 = mul(x = var_1357, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [9, 4, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [9, 4, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 4, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [9, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_74, x = var_1384)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [9, 4, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [9, 4, 256]> var_1390 = silu(x = input_189)[name = tensor<string, []>("op_1390")];
+            tensor<fp32, [9, 4, 256]> input_191 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 4, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [9, 4, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1017, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1367 = const()[name = tensor<string, []>("op_1367"), val = tensor<int32, [4]>([1, 9, 4, 256])];
-            tensor<fp32, [1, 9, 4, 256]> var_1368 = reshape(shape = var_1367, x = xt_5)[name = tensor<string, []>("op_1368")];
-            tensor<int32, [4]> var_1369_perm_0 = const()[name = tensor<string, []>("op_1369_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [3]>([4, 9, 256])];
-            tensor<fp32, [1, 4, 9, 256]> var_1369 = transpose(perm = var_1369_perm_0, x = var_1368)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [4, 9, 256]> query_5 = reshape(shape = var_1372, x = var_1369)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [9, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_66, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 9, 4, 256])];
+            tensor<fp32, [1, 9, 4, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
+            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([4, 9, 256])];
+            tensor<fp32, [1, 4, 9, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [4, 9, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 4, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [9, 4, 768]> var_1395 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [9, 4, 768]> var_1428 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([9, 4, 3, 256])];
-            tensor<fp32, [9, 4, 3, 256]> var_1397 = reshape(shape = concat_2, x = var_1395)[name = tensor<string, []>("op_1397")];
-            tensor<int32, [1]> var_1398_axes_0 = const()[name = tensor<string, []>("op_1398_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 4, 3, 256]> var_1398 = expand_dims(axes = var_1398_axes_0, x = var_1397)[name = tensor<string, []>("op_1398")];
-            tensor<int32, [5]> var_1399_perm_0 = const()[name = tensor<string, []>("op_1399_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1400_axes_0 = const()[name = tensor<string, []>("op_1400_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 4, 1, 256]> var_1399 = transpose(perm = var_1399_perm_0, x = var_1398)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 9, 4, 256]> var_1400 = squeeze(axes = var_1400_axes_0, x = var_1399)[name = tensor<string, []>("op_1400")];
+            tensor<fp32, [9, 4, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
+            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 4, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
+            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 4, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 9, 4, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 9, 4, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [9, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 9, 4, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [9, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 9, 4, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1409 = reshape(shape = var_1408, x = q_19)[name = tensor<string, []>("op_1409")];
+            tensor<fp32, [9, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1415 = const()[name = tensor<string, []>("op_1415"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1416 = reshape(shape = var_1415, x = k_19)[name = tensor<string, []>("op_1416")];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1422 = const()[name = tensor<string, []>("op_1422"), val = tensor<int32, [3]>([9, 16, 64])];
-            tensor<fp32, [9, 16, 64]> var_1423 = reshape(shape = var_1422, x = v_19)[name = tensor<string, []>("op_1423")];
+            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([9, 16, 64])];
+            tensor<fp32, [9, 16, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1426 = const()[name = tensor<string, []>("op_1426"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1409)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [4, 4, 9, 64]> q = reshape(shape = var_1426, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1428 = const()[name = tensor<string, []>("op_1428"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1416)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [4, 4, 9, 64]> k = reshape(shape = var_1428, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([4, 4, 9, 64])];
-            tensor<fp32, [16, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1423)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [4, 4, 9, 64]> v = reshape(shape = var_1430, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [4, 4, 9, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [4, 4, 9, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([4, 4, 9, 64])];
+            tensor<fp32, [16, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [4, 4, 9, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 9, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1277,36 +1292,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 9, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1438 = const()[name = tensor<string, []>("op_1438"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [9, 4, 4, 64]> var_1434 = transpose(perm = var_1433, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1438, x = var_1434)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1442 = const()[name = tensor<string, []>("op_1442"), val = tensor<int32, [3]>([9, 4, 256])];
-            tensor<fp32, [9, 4, 256]> attn_output = reshape(shape = var_1442, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [9, 4, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([9, 4, 256])];
+            tensor<fp32, [9, 4, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 9, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [4, 9, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 9, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1017, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [4, 9, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [4, 9, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [4, 9, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [4, 9, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 9, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 9, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_66, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [4, 9, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [4, 9, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 9, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [4, 9, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1017, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1462 = const()[name = tensor<string, []>("op_1462"), val = tensor<int32, [4]>([1, 4, 9, 256])];
-            tensor<fp32, [1, 4, 9, 256]> input = reshape(shape = var_1462, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1464 = const()[name = tensor<string, []>("op_1464"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 9, 1]> var_1465 = reduce_l2_norm(axes = var_1464, keep_dims = var_1020, x = input)[name = tensor<string, []>("op_1465")];
+            tensor<fp32, [4, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_66, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 4, 9, 256])];
+            tensor<fp32, [1, 4, 9, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 9, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_65, x = input)[name = tensor<string, []>("op_1498")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 9, 1]> clip_5 = clip(alpha = var_1012, beta = const_42, x = var_1465)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 4, 9, 256]> var_1467 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1467")];
+            tensor<fp32, [1, 4, 9, 1]> clip_5 = clip(alpha = var_79, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 4, 9, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([4, 1, 256])];
             tensor<fp32, [4, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([4, 256, 9])];
-            tensor<fp32, [1, 4, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1467)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 4, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [4, 256, 9]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1317,10 +1332,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 4, 8])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 4, 7]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 4, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1471")];
-            tensor<int32, []> var_1473_axis_0 = const()[name = tensor<string, []>("op_1473_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1473_axis_0, values = (var_1169, nkv))[name = tensor<string, []>("op_1473")];
-            tensor<int32, []> var_1475_axis_0 = const()[name = tensor<string, []>("op_1475_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1475_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1475")];
+            tensor<fp32, [1, 4, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
+            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
+            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index adac95464c7b8d573c30be0062638e5c2c0320d8..ed72efa3240836143eb2ec2a8f2cbc986306ded9 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:182619e9c96f802ccae152cbda7d15ae2b22f480c04c689709f70aafdc13e145
-size 191044
+oid sha256:ea37aa7ff0809165f05c3e642105047e5ab12f90ec70be3299df1257bc98c557
+size 197116
diff --git a/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Manifest.json b/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Manifest.json
index 36202097fcb62b8f48444015affe7358861ad8e4..95621140d601caa12673af33e82bbf0318b2be4e 100644
--- a/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Manifest.json
+++ b/optimized/ch/400ms/ls_eend_ch_400ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "1C991CEA-BA44-4416-8759-2F7BBCD14BF6": {
+        "6C11BC4B-3170-4769-81B4-E5118EDB1B86": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "676FBDCA-C6C7-4101-9A73-E68F17B5F2F1": {
+        "CF204ECD-C3A4-43DD-9F68-85A7874B4336": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "676FBDCA-C6C7-4101-9A73-E68F17B5F2F1"
+    "rootModelIdentifier": "CF204ECD-C3A4-43DD-9F68-85A7874B4336"
 }
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/analytics/coremldata.bin b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/analytics/coremldata.bin
index 3126a415e64304c0c88002baacbed295e38ad760..2c56ac9511ab6867e4dcc2bc80303dacc61cdec9 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:631a472bbce1c64c900ce12bd152385af02044081a60d2b7634e29c403b9bc69
+oid sha256:8959c901ace7b3872ee7f214ad15c6993022f8cff276a7f35f918465bdd6cfe3
 size 243
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/coremldata.bin b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/coremldata.bin
index 54dd550a42c104727ee2b06639b7b4ba3bc7bf83..f63b0a6d0db3ceb7d6600b3e6617798efb173804 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/coremldata.bin
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cce9489e7b9d36fdbfd8fa6395dc2943f598f22d4ce2b28f349fff244c15703
-size 1301
+oid sha256:758464a340f78679e799f784a793d9900805b7b05ca4480cbbce6bdf1e0eea42
+size 1404
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/metadata.json b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/metadata.json
index 6c6b1d83c6b133bc9631ee243751876ec6e39cef..d9da2c68b4a00990419d3882ff7963f01c13d62f 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/metadata.json
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=5, max_speakers=7)",
+    "shortDescription" : "LS-EEND CALLHOME streaming diarizer (pipeline, T=5, max_speakers=7, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 72,
+      "Ios17.sliceByIndex" : 77,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 26,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 5 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 55 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 5, 345]",
+        "shape" : "[1, 55, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"ch\", \"model_label\": \"CALLHOME\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 7, \"max_nspks\": 9, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 55}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/model.mil b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/model.mil
index fce20b41f60d6b106fa6a87a31e58eb7e064dc8b..23d359348d66fd1d2d9e467d139aa1b83f5a183a 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/model.mil
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlmodelc/model.mil
@@ -1,234 +1,260 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 5, 345]> features, tensor<fp32, [5]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [5, 5]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [5]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
-            tensor<fp32, [5, 5]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_18 = const()[name = tensor<string, []>("op_18"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_21 = const()[name = tensor<string, []>("op_21"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_24 = const()[name = tensor<string, []>("op_24"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_27 = const()[name = tensor<string, []>("op_27"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 5, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_29, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 9, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 55, 23]> features, tensor<fp32, [5]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [5, 5]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [5]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
+            tensor<fp32, [5, 5]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 45, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, [3]> var_59_begin_0 = const()[name = tensor<string, []>("op_59_begin_0"), val = tensor<int32, [3]>([0, 40, 0])];
+            tensor<int32, [3]> var_59_end_0 = const()[name = tensor<string, []>("op_59_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_59_end_mask_0 = const()[name = tensor<string, []>("op_59_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_59 = slice_by_index(begin = var_59_begin_0, end = var_59_end_0, end_mask = var_59_end_mask_0, x = features)[name = tensor<string, []>("op_59")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49, var_59))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, [3]>([1, 5, 345])];
+            tensor<fp32, [1, 5, 345]> input_1 = reshape(shape = var_66, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_75 = const()[name = tensor<string, []>("op_75"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_81 = const()[name = tensor<string, []>("op_81"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_84 = const()[name = tensor<string, []>("op_84"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_90 = const()[name = tensor<string, []>("op_90"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_93 = const()[name = tensor<string, []>("op_93"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 5, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 5, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_148 = const()[name = tensor<string, []>("op_148"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_149 = mul(x = input_11, y = var_148)[name = tensor<string, []>("op_149")];
-            tensor<fp32, [1, 5, 256]> input_13 = add(x = var_149, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_76, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 5, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 5, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_215 = mul(x = input_13, y = var_214)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 5, 256]> input_15 = add(x = var_215, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_29, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,183 +265,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 5, 256]> var_163 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_164 = const()[name = tensor<string, []>("op_164"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_165 = reshape(shape = var_164, x = var_163)[name = tensor<string, []>("op_165")];
+            tensor<fp32, [1, 5, 256]> var_229 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_230 = const()[name = tensor<string, []>("op_230"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_231 = reshape(shape = var_230, x = var_229)[name = tensor<string, []>("op_231")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_169 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_170 = const()[name = tensor<string, []>("op_170"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_171 = mul(x = var_169, y = var_170)[name = tensor<string, []>("op_171")];
-            tensor<int32, [4]> var_172 = const()[name = tensor<string, []>("op_172"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_173 = reshape(shape = var_172, x = var_171)[name = tensor<string, []>("op_173")];
+            tensor<fp32, [1, 5, 256]> var_235 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_236 = const()[name = tensor<string, []>("op_236"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_237 = mul(x = var_235, y = var_236)[name = tensor<string, []>("op_237")];
+            tensor<int32, [4]> var_238 = const()[name = tensor<string, []>("op_238"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_239 = reshape(shape = var_238, x = var_237)[name = tensor<string, []>("op_239")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_177 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_179 = reshape(shape = var_178, x = var_177)[name = tensor<string, []>("op_179")];
+            tensor<fp32, [1, 5, 256]> var_243 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_245 = reshape(shape = var_244, x = var_243)[name = tensor<string, []>("op_245")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 5, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [5]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_173)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_165)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_239)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_231)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 5, 5]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_190 = reshape(shape = var_189, x = sqrt_s_t_1)[name = tensor<string, []>("op_190")];
-            tensor<fp32, [5, 5]> M_1 = real_div(x = encoder__causal_mask, y = var_190)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 5, 5]> var_192 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_192")];
+            tensor<int32, [2]> var_255 = const()[name = tensor<string, []>("op_255"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_256 = reshape(shape = var_255, x = sqrt_s_t_1)[name = tensor<string, []>("op_256")];
+            tensor<fp32, [5, 5]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_256)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 5, 5]> var_258 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_258")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_179)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_192, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_194_transpose_x_0 = const()[name = tensor<string, []>("op_194_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_194_transpose_y_0 = const()[name = tensor<string, []>("op_194_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_194 = matmul(transpose_x = var_194_transpose_x_0, transpose_y = var_194_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_194")];
-            tensor<fp32, [5]> var_195 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_195")];
-            tensor<int32, [4]> var_196 = const()[name = tensor<string, []>("op_196"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_197 = reshape(shape = var_196, x = var_195)[name = tensor<string, []>("op_197")];
-            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_194, y = var_197)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_245)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_258, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_260_transpose_x_0 = const()[name = tensor<string, []>("op_260_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_260_transpose_y_0 = const()[name = tensor<string, []>("op_260_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_260 = matmul(transpose_x = var_260_transpose_x_0, transpose_y = var_260_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_260")];
+            tensor<fp32, [5]> var_261 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_261")];
+            tensor<int32, [4]> var_262 = const()[name = tensor<string, []>("op_262"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_263 = reshape(shape = var_262, x = var_261)[name = tensor<string, []>("op_263")];
+            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_260, y = var_263)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 5, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_200 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_200")];
-            tensor<bool, []> var_202_transpose_x_1 = const()[name = tensor<string, []>("op_202_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_202_transpose_y_1 = const()[name = tensor<string, []>("op_202_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_202 = matmul(transpose_x = var_202_transpose_x_1, transpose_y = var_202_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_202")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_200, y = var_202)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_204 = const()[name = tensor<string, []>("op_204"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_204)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_206 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_206")];
-            tensor<fp32, [1, 4, 64, 64]> var_207 = real_div(x = new_kv_unnorm_1, y = var_206)[name = tensor<string, []>("op_207")];
-            tensor<int32, [4]> var_208_perm_0 = const()[name = tensor<string, []>("op_208_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_266 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_266")];
+            tensor<bool, []> var_268_transpose_x_1 = const()[name = tensor<string, []>("op_268_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_268_transpose_y_1 = const()[name = tensor<string, []>("op_268_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_268 = matmul(transpose_x = var_268_transpose_x_1, transpose_y = var_268_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_268")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_266, y = var_268)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_270 = const()[name = tensor<string, []>("op_270"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_270)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_272 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_272")];
+            tensor<fp32, [1, 4, 64, 64]> var_273 = real_div(x = new_kv_unnorm_1, y = var_272)[name = tensor<string, []>("op_273")];
+            tensor<int32, [4]> var_274_perm_0 = const()[name = tensor<string, []>("op_274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_208 = transpose(perm = var_208_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_18, x = var_208)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_212 = const()[name = tensor<string, []>("op_212"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_212, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 5, 256]> var_214 = silu(x = input_17)[name = tensor<string, []>("op_214")];
-            tensor<fp32, [1, 5, 256]> input_19 = mul(x = var_214, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 5, 4, 64]> var_274 = transpose(perm = var_274_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_84, x = var_274)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_278 = const()[name = tensor<string, []>("op_278"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_278, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 5, 256]> var_280 = silu(x = input_19)[name = tensor<string, []>("op_280")];
+            tensor<fp32, [1, 5, 256]> input_21 = mul(x = var_280, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_222_begin_0 = const()[name = tensor<string, []>("op_222_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_222_end_0 = const()[name = tensor<string, []>("op_222_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_222_end_mask_0 = const()[name = tensor<string, []>("op_222_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_222 = slice_by_index(begin = var_222_begin_0, end = var_222_end_0, end_mask = var_222_end_mask_0, x = x_3)[name = tensor<string, []>("op_222")];
-            tensor<int32, [3]> var_225_begin_0 = const()[name = tensor<string, []>("op_225_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_225_end_0 = const()[name = tensor<string, []>("op_225_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_225_end_mask_0 = const()[name = tensor<string, []>("op_225_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_225 = slice_by_index(begin = var_225_begin_0, end = var_225_end_0, end_mask = var_225_end_mask_0, x = window_1)[name = tensor<string, []>("op_225")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = x_3)[name = tensor<string, []>("op_288")];
+            tensor<int32, [3]> var_291_begin_0 = const()[name = tensor<string, []>("op_291_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_291_end_0 = const()[name = tensor<string, []>("op_291_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_291_end_mask_0 = const()[name = tensor<string, []>("op_291_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_291 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = window_1)[name = tensor<string, []>("op_291")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_27, interleave = window_3_interleave_0, values = (var_225, var_222))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_230_begin_0 = const()[name = tensor<string, []>("op_230_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_230_end_0 = const()[name = tensor<string, []>("op_230_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_230_end_mask_0 = const()[name = tensor<string, []>("op_230_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_230 = slice_by_index(begin = var_230_begin_0, end = var_230_end_0, end_mask = var_230_end_mask_0, x = x_3)[name = tensor<string, []>("op_230")];
-            tensor<int32, [3]> var_233_begin_0 = const()[name = tensor<string, []>("op_233_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_233_end_0 = const()[name = tensor<string, []>("op_233_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_233_end_mask_0 = const()[name = tensor<string, []>("op_233_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_233 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = window_3)[name = tensor<string, []>("op_233")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_93, interleave = window_3_interleave_0, values = (var_291, var_288))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = x_3)[name = tensor<string, []>("op_296")];
+            tensor<int32, [3]> var_299_begin_0 = const()[name = tensor<string, []>("op_299_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_299_end_0 = const()[name = tensor<string, []>("op_299_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_299_end_mask_0 = const()[name = tensor<string, []>("op_299_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_299 = slice_by_index(begin = var_299_begin_0, end = var_299_end_0, end_mask = var_299_end_mask_0, x = window_3)[name = tensor<string, []>("op_299")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_27, interleave = window_5_interleave_0, values = (var_233, var_230))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_238_begin_0 = const()[name = tensor<string, []>("op_238_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_238_end_0 = const()[name = tensor<string, []>("op_238_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_238_end_mask_0 = const()[name = tensor<string, []>("op_238_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_238 = slice_by_index(begin = var_238_begin_0, end = var_238_end_0, end_mask = var_238_end_mask_0, x = x_3)[name = tensor<string, []>("op_238")];
-            tensor<int32, [3]> var_241_begin_0 = const()[name = tensor<string, []>("op_241_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_241_end_0 = const()[name = tensor<string, []>("op_241_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_241_end_mask_0 = const()[name = tensor<string, []>("op_241_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_241 = slice_by_index(begin = var_241_begin_0, end = var_241_end_0, end_mask = var_241_end_mask_0, x = window_5)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_93, interleave = window_5_interleave_0, values = (var_299, var_296))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = x_3)[name = tensor<string, []>("op_304")];
+            tensor<int32, [3]> var_307_begin_0 = const()[name = tensor<string, []>("op_307_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_307_end_0 = const()[name = tensor<string, []>("op_307_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_307_end_mask_0 = const()[name = tensor<string, []>("op_307_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_307 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = window_5)[name = tensor<string, []>("op_307")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_27, interleave = window_7_interleave_0, values = (var_241, var_238))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_246_begin_0 = const()[name = tensor<string, []>("op_246_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_246_end_0 = const()[name = tensor<string, []>("op_246_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_246_end_mask_0 = const()[name = tensor<string, []>("op_246_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_246 = slice_by_index(begin = var_246_begin_0, end = var_246_end_0, end_mask = var_246_end_mask_0, x = x_3)[name = tensor<string, []>("op_246")];
-            tensor<int32, [3]> var_249_begin_0 = const()[name = tensor<string, []>("op_249_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_249_end_0 = const()[name = tensor<string, []>("op_249_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_249_end_mask_0 = const()[name = tensor<string, []>("op_249_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_249 = slice_by_index(begin = var_249_begin_0, end = var_249_end_0, end_mask = var_249_end_mask_0, x = window_7)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_93, interleave = window_7_interleave_0, values = (var_307, var_304))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_312_begin_0 = const()[name = tensor<string, []>("op_312_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_312_end_0 = const()[name = tensor<string, []>("op_312_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_312_end_mask_0 = const()[name = tensor<string, []>("op_312_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_312 = slice_by_index(begin = var_312_begin_0, end = var_312_end_0, end_mask = var_312_end_mask_0, x = x_3)[name = tensor<string, []>("op_312")];
+            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = window_7)[name = tensor<string, []>("op_315")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_27, interleave = window_9_interleave_0, values = (var_249, var_246))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_254_begin_0 = const()[name = tensor<string, []>("op_254_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_254_end_0 = const()[name = tensor<string, []>("op_254_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_254_end_mask_0 = const()[name = tensor<string, []>("op_254_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_254 = slice_by_index(begin = var_254_begin_0, end = var_254_end_0, end_mask = var_254_end_mask_0, x = x_3)[name = tensor<string, []>("op_254")];
-            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = window_9)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_93, interleave = window_9_interleave_0, values = (var_315, var_312))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_320_begin_0 = const()[name = tensor<string, []>("op_320_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_320_end_0 = const()[name = tensor<string, []>("op_320_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_320_end_mask_0 = const()[name = tensor<string, []>("op_320_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_320 = slice_by_index(begin = var_320_begin_0, end = var_320_end_0, end_mask = var_320_end_mask_0, x = x_3)[name = tensor<string, []>("op_320")];
+            tensor<int32, [3]> var_323_begin_0 = const()[name = tensor<string, []>("op_323_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_323_end_0 = const()[name = tensor<string, []>("op_323_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_323_end_mask_0 = const()[name = tensor<string, []>("op_323_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_323 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = window_9)[name = tensor<string, []>("op_323")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_27, interleave = window_11_interleave_0, values = (var_257, var_254))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_21 = concat(axis = var_24, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_93, interleave = window_11_interleave_0, values = (var_323, var_320))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_23 = concat(axis = var_79, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_282_split_sizes_0 = const()[name = tensor<string, []>("op_282_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_282_axis_0 = const()[name = tensor<string, []>("op_282_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_282_0, tensor<fp32, [5, 256, 16]> var_282_1 = split(axis = var_282_axis_0, split_sizes = var_282_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_282")];
-            tensor<fp32, [5, 256, 16]> var_284 = sigmoid(x = var_282_1)[name = tensor<string, []>("op_284")];
-            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_282_0, y = var_284)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [5, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_348_split_sizes_0 = const()[name = tensor<string, []>("op_348_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_348_axis_0 = const()[name = tensor<string, []>("op_348_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_348_0, tensor<fp32, [5, 256, 16]> var_348_1 = split(axis = var_348_axis_0, split_sizes = var_348_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_348")];
+            tensor<fp32, [5, 256, 16]> var_350 = sigmoid(x = var_348_1)[name = tensor<string, []>("op_350")];
+            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_348_0, y = var_350)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [5, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [5, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [5, 1, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_315")];
-            tensor<int32, [3]> var_317_perm_0 = const()[name = tensor<string, []>("op_317_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_317 = transpose(perm = var_317_perm_0, x = var_315)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 5, 256]> input_31 = add(x = x_3, y = var_317)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 5, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 5, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_340 = const()[name = tensor<string, []>("op_340"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_341 = mul(x = input_39, y = var_340)[name = tensor<string, []>("op_341")];
-            tensor<fp32, [1, 5, 256]> input_41 = add(x = var_341, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_29, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_381_begin_0 = const()[name = tensor<string, []>("op_381_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_381_end_0 = const()[name = tensor<string, []>("op_381_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_381_end_mask_0 = const()[name = tensor<string, []>("op_381_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [5, 1, 256]> var_381 = slice_by_index(begin = var_381_begin_0, end = var_381_end_0, end_mask = var_381_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_381")];
+            tensor<int32, [3]> var_383_perm_0 = const()[name = tensor<string, []>("op_383_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_383 = transpose(perm = var_383_perm_0, x = var_381)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 5, 256]> input_33 = add(x = x_3, y = var_383)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 5, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 5, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_406 = const()[name = tensor<string, []>("op_406"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_407 = mul(x = input_41, y = var_406)[name = tensor<string, []>("op_407")];
+            tensor<fp32, [1, 5, 256]> input_43 = add(x = var_407, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 5, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 5, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_370 = const()[name = tensor<string, []>("op_370"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_371 = mul(x = input_51, y = var_370)[name = tensor<string, []>("op_371")];
-            tensor<fp32, [1, 5, 256]> input_53 = add(x = var_371, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 5, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 5, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_436 = const()[name = tensor<string, []>("op_436"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_437 = mul(x = input_53, y = var_436)[name = tensor<string, []>("op_437")];
+            tensor<fp32, [1, 5, 256]> input_55 = add(x = var_437, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_29, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -426,183 +452,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 5, 256]> var_385 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_387 = reshape(shape = var_386, x = var_385)[name = tensor<string, []>("op_387")];
+            tensor<fp32, [1, 5, 256]> var_451 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_452 = const()[name = tensor<string, []>("op_452"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_453 = reshape(shape = var_452, x = var_451)[name = tensor<string, []>("op_453")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_391 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_392 = const()[name = tensor<string, []>("op_392"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_393 = mul(x = var_391, y = var_392)[name = tensor<string, []>("op_393")];
-            tensor<int32, [4]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_395 = reshape(shape = var_394, x = var_393)[name = tensor<string, []>("op_395")];
+            tensor<fp32, [1, 5, 256]> var_457 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_459 = mul(x = var_457, y = var_458)[name = tensor<string, []>("op_459")];
+            tensor<int32, [4]> var_460 = const()[name = tensor<string, []>("op_460"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_461 = reshape(shape = var_460, x = var_459)[name = tensor<string, []>("op_461")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_399 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_400 = const()[name = tensor<string, []>("op_400"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_401 = reshape(shape = var_400, x = var_399)[name = tensor<string, []>("op_401")];
+            tensor<fp32, [1, 5, 256]> var_465 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_466 = const()[name = tensor<string, []>("op_466"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_467 = reshape(shape = var_466, x = var_465)[name = tensor<string, []>("op_467")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 5, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [5]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_395)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_387)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_461)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_453)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 5, 5]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_412 = reshape(shape = var_411, x = sqrt_s_t_3)[name = tensor<string, []>("op_412")];
-            tensor<fp32, [5, 5]> M_3 = real_div(x = encoder__causal_mask, y = var_412)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 5, 5]> var_414 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_414")];
+            tensor<int32, [2]> var_477 = const()[name = tensor<string, []>("op_477"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_478 = reshape(shape = var_477, x = sqrt_s_t_3)[name = tensor<string, []>("op_478")];
+            tensor<fp32, [5, 5]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_478)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 5, 5]> var_480 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_480")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_401)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_414, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_416_transpose_x_0 = const()[name = tensor<string, []>("op_416_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_416_transpose_y_0 = const()[name = tensor<string, []>("op_416_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_416 = matmul(transpose_x = var_416_transpose_x_0, transpose_y = var_416_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_416")];
-            tensor<fp32, [5]> var_417 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_417")];
-            tensor<int32, [4]> var_418 = const()[name = tensor<string, []>("op_418"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_419 = reshape(shape = var_418, x = var_417)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_416, y = var_419)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_467)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_480, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_482_transpose_x_0 = const()[name = tensor<string, []>("op_482_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_482_transpose_y_0 = const()[name = tensor<string, []>("op_482_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_482 = matmul(transpose_x = var_482_transpose_x_0, transpose_y = var_482_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_482")];
+            tensor<fp32, [5]> var_483 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_483")];
+            tensor<int32, [4]> var_484 = const()[name = tensor<string, []>("op_484"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_485 = reshape(shape = var_484, x = var_483)[name = tensor<string, []>("op_485")];
+            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_482, y = var_485)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 5, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_422 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_422")];
-            tensor<bool, []> var_424_transpose_x_1 = const()[name = tensor<string, []>("op_424_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_424_transpose_y_1 = const()[name = tensor<string, []>("op_424_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_424 = matmul(transpose_x = var_424_transpose_x_1, transpose_y = var_424_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_424")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_422, y = var_424)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_426 = const()[name = tensor<string, []>("op_426"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_426)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_428 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_428")];
-            tensor<fp32, [1, 4, 64, 64]> var_429 = real_div(x = new_kv_unnorm_3, y = var_428)[name = tensor<string, []>("op_429")];
-            tensor<int32, [4]> var_430_perm_0 = const()[name = tensor<string, []>("op_430_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_488 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_488")];
+            tensor<bool, []> var_490_transpose_x_1 = const()[name = tensor<string, []>("op_490_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_490_transpose_y_1 = const()[name = tensor<string, []>("op_490_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_490 = matmul(transpose_x = var_490_transpose_x_1, transpose_y = var_490_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_490")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_488, y = var_490)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_492 = const()[name = tensor<string, []>("op_492"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_492)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_494 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_494")];
+            tensor<fp32, [1, 4, 64, 64]> var_495 = real_div(x = new_kv_unnorm_3, y = var_494)[name = tensor<string, []>("op_495")];
+            tensor<int32, [4]> var_496_perm_0 = const()[name = tensor<string, []>("op_496_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_430 = transpose(perm = var_430_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_18, x = var_430)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_434 = const()[name = tensor<string, []>("op_434"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_434, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 5, 256]> var_436 = silu(x = input_57)[name = tensor<string, []>("op_436")];
-            tensor<fp32, [1, 5, 256]> input_59 = mul(x = var_436, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 5, 4, 64]> var_496 = transpose(perm = var_496_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_84, x = var_496)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_500 = const()[name = tensor<string, []>("op_500"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_500, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 5, 256]> var_502 = silu(x = input_59)[name = tensor<string, []>("op_502")];
+            tensor<fp32, [1, 5, 256]> input_61 = mul(x = var_502, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_444_begin_0 = const()[name = tensor<string, []>("op_444_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_444_end_0 = const()[name = tensor<string, []>("op_444_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_444_end_mask_0 = const()[name = tensor<string, []>("op_444_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_444 = slice_by_index(begin = var_444_begin_0, end = var_444_end_0, end_mask = var_444_end_mask_0, x = x_9)[name = tensor<string, []>("op_444")];
-            tensor<int32, [3]> var_447_begin_0 = const()[name = tensor<string, []>("op_447_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_447_end_0 = const()[name = tensor<string, []>("op_447_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_447_end_mask_0 = const()[name = tensor<string, []>("op_447_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_447 = slice_by_index(begin = var_447_begin_0, end = var_447_end_0, end_mask = var_447_end_mask_0, x = window_13)[name = tensor<string, []>("op_447")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = x_9)[name = tensor<string, []>("op_510")];
+            tensor<int32, [3]> var_513_begin_0 = const()[name = tensor<string, []>("op_513_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_513_end_0 = const()[name = tensor<string, []>("op_513_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_513_end_mask_0 = const()[name = tensor<string, []>("op_513_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_513 = slice_by_index(begin = var_513_begin_0, end = var_513_end_0, end_mask = var_513_end_mask_0, x = window_13)[name = tensor<string, []>("op_513")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_27, interleave = window_15_interleave_0, values = (var_447, var_444))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_452_begin_0 = const()[name = tensor<string, []>("op_452_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_452_end_0 = const()[name = tensor<string, []>("op_452_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_452_end_mask_0 = const()[name = tensor<string, []>("op_452_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_452 = slice_by_index(begin = var_452_begin_0, end = var_452_end_0, end_mask = var_452_end_mask_0, x = x_9)[name = tensor<string, []>("op_452")];
-            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = window_15)[name = tensor<string, []>("op_455")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_93, interleave = window_15_interleave_0, values = (var_513, var_510))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = x_9)[name = tensor<string, []>("op_518")];
+            tensor<int32, [3]> var_521_begin_0 = const()[name = tensor<string, []>("op_521_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_521_end_0 = const()[name = tensor<string, []>("op_521_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_521_end_mask_0 = const()[name = tensor<string, []>("op_521_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_521 = slice_by_index(begin = var_521_begin_0, end = var_521_end_0, end_mask = var_521_end_mask_0, x = window_15)[name = tensor<string, []>("op_521")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_27, interleave = window_17_interleave_0, values = (var_455, var_452))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_460_begin_0 = const()[name = tensor<string, []>("op_460_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_460_end_0 = const()[name = tensor<string, []>("op_460_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_460_end_mask_0 = const()[name = tensor<string, []>("op_460_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_460 = slice_by_index(begin = var_460_begin_0, end = var_460_end_0, end_mask = var_460_end_mask_0, x = x_9)[name = tensor<string, []>("op_460")];
-            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = window_17)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_93, interleave = window_17_interleave_0, values = (var_521, var_518))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_526_begin_0 = const()[name = tensor<string, []>("op_526_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_526_end_0 = const()[name = tensor<string, []>("op_526_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_526_end_mask_0 = const()[name = tensor<string, []>("op_526_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_526 = slice_by_index(begin = var_526_begin_0, end = var_526_end_0, end_mask = var_526_end_mask_0, x = x_9)[name = tensor<string, []>("op_526")];
+            tensor<int32, [3]> var_529_begin_0 = const()[name = tensor<string, []>("op_529_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_529_end_0 = const()[name = tensor<string, []>("op_529_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_529_end_mask_0 = const()[name = tensor<string, []>("op_529_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_529 = slice_by_index(begin = var_529_begin_0, end = var_529_end_0, end_mask = var_529_end_mask_0, x = window_17)[name = tensor<string, []>("op_529")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_27, interleave = window_19_interleave_0, values = (var_463, var_460))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_468_begin_0 = const()[name = tensor<string, []>("op_468_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_468_end_0 = const()[name = tensor<string, []>("op_468_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_468_end_mask_0 = const()[name = tensor<string, []>("op_468_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_468 = slice_by_index(begin = var_468_begin_0, end = var_468_end_0, end_mask = var_468_end_mask_0, x = x_9)[name = tensor<string, []>("op_468")];
-            tensor<int32, [3]> var_471_begin_0 = const()[name = tensor<string, []>("op_471_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_471_end_0 = const()[name = tensor<string, []>("op_471_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_471_end_mask_0 = const()[name = tensor<string, []>("op_471_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_471 = slice_by_index(begin = var_471_begin_0, end = var_471_end_0, end_mask = var_471_end_mask_0, x = window_19)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_93, interleave = window_19_interleave_0, values = (var_529, var_526))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_534_begin_0 = const()[name = tensor<string, []>("op_534_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_534_end_0 = const()[name = tensor<string, []>("op_534_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_534_end_mask_0 = const()[name = tensor<string, []>("op_534_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_534 = slice_by_index(begin = var_534_begin_0, end = var_534_end_0, end_mask = var_534_end_mask_0, x = x_9)[name = tensor<string, []>("op_534")];
+            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = window_19)[name = tensor<string, []>("op_537")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_27, interleave = window_21_interleave_0, values = (var_471, var_468))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = x_9)[name = tensor<string, []>("op_476")];
-            tensor<int32, [3]> var_479_begin_0 = const()[name = tensor<string, []>("op_479_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_479_end_0 = const()[name = tensor<string, []>("op_479_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_479_end_mask_0 = const()[name = tensor<string, []>("op_479_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_479 = slice_by_index(begin = var_479_begin_0, end = var_479_end_0, end_mask = var_479_end_mask_0, x = window_21)[name = tensor<string, []>("op_479")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_93, interleave = window_21_interleave_0, values = (var_537, var_534))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_542_begin_0 = const()[name = tensor<string, []>("op_542_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_542_end_0 = const()[name = tensor<string, []>("op_542_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_542_end_mask_0 = const()[name = tensor<string, []>("op_542_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_542 = slice_by_index(begin = var_542_begin_0, end = var_542_end_0, end_mask = var_542_end_mask_0, x = x_9)[name = tensor<string, []>("op_542")];
+            tensor<int32, [3]> var_545_begin_0 = const()[name = tensor<string, []>("op_545_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_545_end_0 = const()[name = tensor<string, []>("op_545_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_545_end_mask_0 = const()[name = tensor<string, []>("op_545_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_545 = slice_by_index(begin = var_545_begin_0, end = var_545_end_0, end_mask = var_545_end_mask_0, x = window_21)[name = tensor<string, []>("op_545")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_27, interleave = window_23_interleave_0, values = (var_479, var_476))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_61 = concat(axis = var_24, interleave = input_61_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_93, interleave = window_23_interleave_0, values = (var_545, var_542))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_63 = concat(axis = var_79, interleave = input_63_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_504_split_sizes_0 = const()[name = tensor<string, []>("op_504_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_504_axis_0 = const()[name = tensor<string, []>("op_504_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_504_0, tensor<fp32, [5, 256, 16]> var_504_1 = split(axis = var_504_axis_0, split_sizes = var_504_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_504")];
-            tensor<fp32, [5, 256, 16]> var_506 = sigmoid(x = var_504_1)[name = tensor<string, []>("op_506")];
-            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_504_0, y = var_506)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [5, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_570_split_sizes_0 = const()[name = tensor<string, []>("op_570_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_570_axis_0 = const()[name = tensor<string, []>("op_570_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_570_0, tensor<fp32, [5, 256, 16]> var_570_1 = split(axis = var_570_axis_0, split_sizes = var_570_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_570")];
+            tensor<fp32, [5, 256, 16]> var_572 = sigmoid(x = var_570_1)[name = tensor<string, []>("op_572")];
+            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_570_0, y = var_572)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [5, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [5, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [5, 1, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_537")];
-            tensor<int32, [3]> var_539_perm_0 = const()[name = tensor<string, []>("op_539_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_539 = transpose(perm = var_539_perm_0, x = var_537)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 5, 256]> input_71 = add(x = x_9, y = var_539)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 5, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 5, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_562 = const()[name = tensor<string, []>("op_562"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_563 = mul(x = input_79, y = var_562)[name = tensor<string, []>("op_563")];
-            tensor<fp32, [1, 5, 256]> input_81 = add(x = var_563, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_29, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_603_begin_0 = const()[name = tensor<string, []>("op_603_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_603_end_0 = const()[name = tensor<string, []>("op_603_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_603_end_mask_0 = const()[name = tensor<string, []>("op_603_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [5, 1, 256]> var_603 = slice_by_index(begin = var_603_begin_0, end = var_603_end_0, end_mask = var_603_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_603")];
+            tensor<int32, [3]> var_605_perm_0 = const()[name = tensor<string, []>("op_605_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_605 = transpose(perm = var_605_perm_0, x = var_603)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 5, 256]> input_73 = add(x = x_9, y = var_605)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 5, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 5, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_628 = const()[name = tensor<string, []>("op_628"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_629 = mul(x = input_81, y = var_628)[name = tensor<string, []>("op_629")];
+            tensor<fp32, [1, 5, 256]> input_83 = add(x = var_629, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 5, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 5, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_592 = const()[name = tensor<string, []>("op_592"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_593 = mul(x = input_91, y = var_592)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 5, 256]> input_93 = add(x = var_593, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 5, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 5, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_658 = const()[name = tensor<string, []>("op_658"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_659 = mul(x = input_93, y = var_658)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 5, 256]> input_95 = add(x = var_659, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_29, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -613,183 +639,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 5, 256]> var_607 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_608 = const()[name = tensor<string, []>("op_608"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_609 = reshape(shape = var_608, x = var_607)[name = tensor<string, []>("op_609")];
+            tensor<fp32, [1, 5, 256]> var_673 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_675 = reshape(shape = var_674, x = var_673)[name = tensor<string, []>("op_675")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_613 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_614 = const()[name = tensor<string, []>("op_614"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_615 = mul(x = var_613, y = var_614)[name = tensor<string, []>("op_615")];
-            tensor<int32, [4]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_617 = reshape(shape = var_616, x = var_615)[name = tensor<string, []>("op_617")];
+            tensor<fp32, [1, 5, 256]> var_679 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_680 = const()[name = tensor<string, []>("op_680"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_681 = mul(x = var_679, y = var_680)[name = tensor<string, []>("op_681")];
+            tensor<int32, [4]> var_682 = const()[name = tensor<string, []>("op_682"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_683 = reshape(shape = var_682, x = var_681)[name = tensor<string, []>("op_683")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_621 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_622 = const()[name = tensor<string, []>("op_622"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_623 = reshape(shape = var_622, x = var_621)[name = tensor<string, []>("op_623")];
+            tensor<fp32, [1, 5, 256]> var_687 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_688 = const()[name = tensor<string, []>("op_688"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_689 = reshape(shape = var_688, x = var_687)[name = tensor<string, []>("op_689")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 5, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [5]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_617)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_609)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_683)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_675)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 5, 5]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_633 = const()[name = tensor<string, []>("op_633"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_634 = reshape(shape = var_633, x = sqrt_s_t_5)[name = tensor<string, []>("op_634")];
-            tensor<fp32, [5, 5]> M_5 = real_div(x = encoder__causal_mask, y = var_634)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 5, 5]> var_636 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_636")];
+            tensor<int32, [2]> var_699 = const()[name = tensor<string, []>("op_699"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_700 = reshape(shape = var_699, x = sqrt_s_t_5)[name = tensor<string, []>("op_700")];
+            tensor<fp32, [5, 5]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_700)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 5, 5]> var_702 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_702")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_623)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_636, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_638_transpose_x_0 = const()[name = tensor<string, []>("op_638_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_638_transpose_y_0 = const()[name = tensor<string, []>("op_638_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_638 = matmul(transpose_x = var_638_transpose_x_0, transpose_y = var_638_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_638")];
-            tensor<fp32, [5]> var_639 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_639")];
-            tensor<int32, [4]> var_640 = const()[name = tensor<string, []>("op_640"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_641 = reshape(shape = var_640, x = var_639)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_638, y = var_641)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_689)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_702, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_704_transpose_x_0 = const()[name = tensor<string, []>("op_704_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_704_transpose_y_0 = const()[name = tensor<string, []>("op_704_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_704 = matmul(transpose_x = var_704_transpose_x_0, transpose_y = var_704_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_704")];
+            tensor<fp32, [5]> var_705 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_705")];
+            tensor<int32, [4]> var_706 = const()[name = tensor<string, []>("op_706"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_707 = reshape(shape = var_706, x = var_705)[name = tensor<string, []>("op_707")];
+            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_704, y = var_707)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 5, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_644 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_644")];
-            tensor<bool, []> var_646_transpose_x_1 = const()[name = tensor<string, []>("op_646_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_646_transpose_y_1 = const()[name = tensor<string, []>("op_646_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_646 = matmul(transpose_x = var_646_transpose_x_1, transpose_y = var_646_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_646")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_644, y = var_646)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_648 = const()[name = tensor<string, []>("op_648"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_648)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_650 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_650")];
-            tensor<fp32, [1, 4, 64, 64]> var_651 = real_div(x = new_kv_unnorm_5, y = var_650)[name = tensor<string, []>("op_651")];
-            tensor<int32, [4]> var_652_perm_0 = const()[name = tensor<string, []>("op_652_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_710 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_710")];
+            tensor<bool, []> var_712_transpose_x_1 = const()[name = tensor<string, []>("op_712_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_712_transpose_y_1 = const()[name = tensor<string, []>("op_712_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_712 = matmul(transpose_x = var_712_transpose_x_1, transpose_y = var_712_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_712")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_710, y = var_712)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_714 = const()[name = tensor<string, []>("op_714"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_714)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_716 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_716")];
+            tensor<fp32, [1, 4, 64, 64]> var_717 = real_div(x = new_kv_unnorm_5, y = var_716)[name = tensor<string, []>("op_717")];
+            tensor<int32, [4]> var_718_perm_0 = const()[name = tensor<string, []>("op_718_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_652 = transpose(perm = var_652_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_18, x = var_652)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_656 = const()[name = tensor<string, []>("op_656"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_656, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 5, 256]> var_658 = silu(x = input_97)[name = tensor<string, []>("op_658")];
-            tensor<fp32, [1, 5, 256]> input_99 = mul(x = var_658, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 5, 4, 64]> var_718 = transpose(perm = var_718_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_84, x = var_718)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_722, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 5, 256]> var_724 = silu(x = input_99)[name = tensor<string, []>("op_724")];
+            tensor<fp32, [1, 5, 256]> input_101 = mul(x = var_724, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_666_begin_0 = const()[name = tensor<string, []>("op_666_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_666_end_0 = const()[name = tensor<string, []>("op_666_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_666_end_mask_0 = const()[name = tensor<string, []>("op_666_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_666 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = x_15)[name = tensor<string, []>("op_666")];
-            tensor<int32, [3]> var_669_begin_0 = const()[name = tensor<string, []>("op_669_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_669_end_0 = const()[name = tensor<string, []>("op_669_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_669_end_mask_0 = const()[name = tensor<string, []>("op_669_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_669 = slice_by_index(begin = var_669_begin_0, end = var_669_end_0, end_mask = var_669_end_mask_0, x = window_25)[name = tensor<string, []>("op_669")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = x_15)[name = tensor<string, []>("op_732")];
+            tensor<int32, [3]> var_735_begin_0 = const()[name = tensor<string, []>("op_735_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_735_end_0 = const()[name = tensor<string, []>("op_735_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_735_end_mask_0 = const()[name = tensor<string, []>("op_735_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_735 = slice_by_index(begin = var_735_begin_0, end = var_735_end_0, end_mask = var_735_end_mask_0, x = window_25)[name = tensor<string, []>("op_735")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_27, interleave = window_27_interleave_0, values = (var_669, var_666))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_674_begin_0 = const()[name = tensor<string, []>("op_674_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_674_end_0 = const()[name = tensor<string, []>("op_674_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_674_end_mask_0 = const()[name = tensor<string, []>("op_674_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_674 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = x_15)[name = tensor<string, []>("op_674")];
-            tensor<int32, [3]> var_677_begin_0 = const()[name = tensor<string, []>("op_677_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_677_end_0 = const()[name = tensor<string, []>("op_677_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_677_end_mask_0 = const()[name = tensor<string, []>("op_677_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_677 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = window_27)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_93, interleave = window_27_interleave_0, values = (var_735, var_732))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_740_begin_0 = const()[name = tensor<string, []>("op_740_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_740_end_0 = const()[name = tensor<string, []>("op_740_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_740_end_mask_0 = const()[name = tensor<string, []>("op_740_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_740 = slice_by_index(begin = var_740_begin_0, end = var_740_end_0, end_mask = var_740_end_mask_0, x = x_15)[name = tensor<string, []>("op_740")];
+            tensor<int32, [3]> var_743_begin_0 = const()[name = tensor<string, []>("op_743_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_743_end_0 = const()[name = tensor<string, []>("op_743_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_743_end_mask_0 = const()[name = tensor<string, []>("op_743_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_743 = slice_by_index(begin = var_743_begin_0, end = var_743_end_0, end_mask = var_743_end_mask_0, x = window_27)[name = tensor<string, []>("op_743")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_27, interleave = window_29_interleave_0, values = (var_677, var_674))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = x_15)[name = tensor<string, []>("op_682")];
-            tensor<int32, [3]> var_685_begin_0 = const()[name = tensor<string, []>("op_685_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_685_end_0 = const()[name = tensor<string, []>("op_685_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_685_end_mask_0 = const()[name = tensor<string, []>("op_685_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_685 = slice_by_index(begin = var_685_begin_0, end = var_685_end_0, end_mask = var_685_end_mask_0, x = window_29)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_93, interleave = window_29_interleave_0, values = (var_743, var_740))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_748_begin_0 = const()[name = tensor<string, []>("op_748_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_748_end_0 = const()[name = tensor<string, []>("op_748_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_748_end_mask_0 = const()[name = tensor<string, []>("op_748_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_748 = slice_by_index(begin = var_748_begin_0, end = var_748_end_0, end_mask = var_748_end_mask_0, x = x_15)[name = tensor<string, []>("op_748")];
+            tensor<int32, [3]> var_751_begin_0 = const()[name = tensor<string, []>("op_751_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_751_end_0 = const()[name = tensor<string, []>("op_751_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_751_end_mask_0 = const()[name = tensor<string, []>("op_751_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_751 = slice_by_index(begin = var_751_begin_0, end = var_751_end_0, end_mask = var_751_end_mask_0, x = window_29)[name = tensor<string, []>("op_751")];
             tensor<bool, []> window_31_interleave_0 = const()[name = tensor<string, []>("window_31_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_27, interleave = window_31_interleave_0, values = (var_685, var_682))[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = x_15)[name = tensor<string, []>("op_690")];
-            tensor<int32, [3]> var_693_begin_0 = const()[name = tensor<string, []>("op_693_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_693_end_0 = const()[name = tensor<string, []>("op_693_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_693_end_mask_0 = const()[name = tensor<string, []>("op_693_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_693 = slice_by_index(begin = var_693_begin_0, end = var_693_end_0, end_mask = var_693_end_mask_0, x = window_31)[name = tensor<string, []>("op_693")];
+            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_93, interleave = window_31_interleave_0, values = (var_751, var_748))[name = tensor<string, []>("window_31")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = x_15)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = window_31)[name = tensor<string, []>("op_759")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_27, interleave = window_33_interleave_0, values = (var_693, var_690))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = x_15)[name = tensor<string, []>("op_698")];
-            tensor<int32, [3]> var_701_begin_0 = const()[name = tensor<string, []>("op_701_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_701_end_0 = const()[name = tensor<string, []>("op_701_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_701_end_mask_0 = const()[name = tensor<string, []>("op_701_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_701 = slice_by_index(begin = var_701_begin_0, end = var_701_end_0, end_mask = var_701_end_mask_0, x = window_33)[name = tensor<string, []>("op_701")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_93, interleave = window_33_interleave_0, values = (var_759, var_756))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_764_begin_0 = const()[name = tensor<string, []>("op_764_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_764_end_0 = const()[name = tensor<string, []>("op_764_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_764_end_mask_0 = const()[name = tensor<string, []>("op_764_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_764 = slice_by_index(begin = var_764_begin_0, end = var_764_end_0, end_mask = var_764_end_mask_0, x = x_15)[name = tensor<string, []>("op_764")];
+            tensor<int32, [3]> var_767_begin_0 = const()[name = tensor<string, []>("op_767_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_767_end_0 = const()[name = tensor<string, []>("op_767_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_767_end_mask_0 = const()[name = tensor<string, []>("op_767_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_767 = slice_by_index(begin = var_767_begin_0, end = var_767_end_0, end_mask = var_767_end_mask_0, x = window_33)[name = tensor<string, []>("op_767")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_27, interleave = window_35_interleave_0, values = (var_701, var_698))[name = tensor<string, []>("window_35")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_101 = concat(axis = var_24, interleave = input_101_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_93, interleave = window_35_interleave_0, values = (var_767, var_764))[name = tensor<string, []>("window_35")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_103 = concat(axis = var_79, interleave = input_103_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_726_split_sizes_0 = const()[name = tensor<string, []>("op_726_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_726_axis_0 = const()[name = tensor<string, []>("op_726_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_726_0, tensor<fp32, [5, 256, 16]> var_726_1 = split(axis = var_726_axis_0, split_sizes = var_726_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_726")];
-            tensor<fp32, [5, 256, 16]> var_728 = sigmoid(x = var_726_1)[name = tensor<string, []>("op_728")];
-            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_726_0, y = var_728)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [5, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_792_split_sizes_0 = const()[name = tensor<string, []>("op_792_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_792_axis_0 = const()[name = tensor<string, []>("op_792_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_792_0, tensor<fp32, [5, 256, 16]> var_792_1 = split(axis = var_792_axis_0, split_sizes = var_792_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [5, 256, 16]> var_794 = sigmoid(x = var_792_1)[name = tensor<string, []>("op_794")];
+            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_792_0, y = var_794)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [5, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [5, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [5, 1, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_759")];
-            tensor<int32, [3]> var_761_perm_0 = const()[name = tensor<string, []>("op_761_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_761 = transpose(perm = var_761_perm_0, x = var_759)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 5, 256]> input_111 = add(x = x_15, y = var_761)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 5, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 5, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_785 = mul(x = input_119, y = var_784)[name = tensor<string, []>("op_785")];
-            tensor<fp32, [1, 5, 256]> input_121 = add(x = var_785, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_29, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_825_begin_0 = const()[name = tensor<string, []>("op_825_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_825_end_0 = const()[name = tensor<string, []>("op_825_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_825_end_mask_0 = const()[name = tensor<string, []>("op_825_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [5, 1, 256]> var_825 = slice_by_index(begin = var_825_begin_0, end = var_825_end_0, end_mask = var_825_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_825")];
+            tensor<int32, [3]> var_827_perm_0 = const()[name = tensor<string, []>("op_827_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_827 = transpose(perm = var_827_perm_0, x = var_825)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 5, 256]> input_113 = add(x = x_15, y = var_827)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 5, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 5, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_850 = const()[name = tensor<string, []>("op_850"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_851 = mul(x = input_121, y = var_850)[name = tensor<string, []>("op_851")];
+            tensor<fp32, [1, 5, 256]> input_123 = add(x = var_851, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 5, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 5, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_814 = const()[name = tensor<string, []>("op_814"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_815 = mul(x = input_131, y = var_814)[name = tensor<string, []>("op_815")];
-            tensor<fp32, [1, 5, 256]> input_133 = add(x = var_815, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 5, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 5, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_880 = const()[name = tensor<string, []>("op_880"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_881 = mul(x = input_133, y = var_880)[name = tensor<string, []>("op_881")];
+            tensor<fp32, [1, 5, 256]> input_135 = add(x = var_881, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_29, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -800,219 +826,212 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 5, 256]> var_829 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_831 = reshape(shape = var_830, x = var_829)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 5, 256]> var_895 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_896 = const()[name = tensor<string, []>("op_896"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_897 = reshape(shape = var_896, x = var_895)[name = tensor<string, []>("op_897")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_835 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_836 = const()[name = tensor<string, []>("op_836"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_837 = mul(x = var_835, y = var_836)[name = tensor<string, []>("op_837")];
-            tensor<int32, [4]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_839 = reshape(shape = var_838, x = var_837)[name = tensor<string, []>("op_839")];
+            tensor<fp32, [1, 5, 256]> var_901 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_902 = const()[name = tensor<string, []>("op_902"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_903 = mul(x = var_901, y = var_902)[name = tensor<string, []>("op_903")];
+            tensor<int32, [4]> var_904 = const()[name = tensor<string, []>("op_904"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_905 = reshape(shape = var_904, x = var_903)[name = tensor<string, []>("op_905")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_843 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_845 = reshape(shape = var_844, x = var_843)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 5, 256]> var_909 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_910 = const()[name = tensor<string, []>("op_910"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_911 = reshape(shape = var_910, x = var_909)[name = tensor<string, []>("op_911")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 5, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [5]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_839)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_831)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_905)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_897)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 5, 5]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_855 = const()[name = tensor<string, []>("op_855"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_856 = reshape(shape = var_855, x = sqrt_s_t_7)[name = tensor<string, []>("op_856")];
-            tensor<fp32, [5, 5]> M_7 = real_div(x = encoder__causal_mask, y = var_856)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 5, 5]> var_858 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [2]> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_922 = reshape(shape = var_921, x = sqrt_s_t_7)[name = tensor<string, []>("op_922")];
+            tensor<fp32, [5, 5]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_922)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 5, 5]> var_924 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_924")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_845)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_858, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_860_transpose_x_0 = const()[name = tensor<string, []>("op_860_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_860_transpose_y_0 = const()[name = tensor<string, []>("op_860_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_860 = matmul(transpose_x = var_860_transpose_x_0, transpose_y = var_860_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_860")];
-            tensor<fp32, [5]> var_861 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_861")];
-            tensor<int32, [4]> var_862 = const()[name = tensor<string, []>("op_862"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_863 = reshape(shape = var_862, x = var_861)[name = tensor<string, []>("op_863")];
-            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_860, y = var_863)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_911)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_924, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_926_transpose_x_0 = const()[name = tensor<string, []>("op_926_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_926_transpose_y_0 = const()[name = tensor<string, []>("op_926_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_926 = matmul(transpose_x = var_926_transpose_x_0, transpose_y = var_926_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_926")];
+            tensor<fp32, [5]> var_927 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_927")];
+            tensor<int32, [4]> var_928 = const()[name = tensor<string, []>("op_928"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_929 = reshape(shape = var_928, x = var_927)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_926, y = var_929)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 5, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_866 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_866")];
-            tensor<bool, []> var_868_transpose_x_1 = const()[name = tensor<string, []>("op_868_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_868_transpose_y_1 = const()[name = tensor<string, []>("op_868_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_868 = matmul(transpose_x = var_868_transpose_x_1, transpose_y = var_868_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_868")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_866, y = var_868)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_870 = const()[name = tensor<string, []>("op_870"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_870)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_872 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_872")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_872)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_874_perm_0 = const()[name = tensor<string, []>("op_874_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_932 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_932")];
+            tensor<bool, []> var_934_transpose_x_1 = const()[name = tensor<string, []>("op_934_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_934_transpose_y_1 = const()[name = tensor<string, []>("op_934_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_934 = matmul(transpose_x = var_934_transpose_x_1, transpose_y = var_934_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_934")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_932, y = var_934)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_936)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_938 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_938")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_938)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_940_perm_0 = const()[name = tensor<string, []>("op_940_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_874 = transpose(perm = var_874_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_18, x = var_874)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_878 = const()[name = tensor<string, []>("op_878"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_878, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 5, 256]> var_880 = silu(x = input_137)[name = tensor<string, []>("op_880")];
-            tensor<fp32, [1, 5, 256]> input_139 = mul(x = var_880, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 5, 4, 64]> var_940 = transpose(perm = var_940_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_84, x = var_940)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_944 = const()[name = tensor<string, []>("op_944"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_944, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 5, 256]> var_946 = silu(x = input_139)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 5, 256]> input_141 = mul(x = var_946, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_37_begin_0 = const()[name = tensor<string, []>("window_37_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_37_end_0 = const()[name = tensor<string, []>("window_37_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_37_end_mask_0 = const()[name = tensor<string, []>("window_37_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_37_squeeze_mask_0 = const()[name = tensor<string, []>("window_37_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_37 = slice_by_index(begin = window_37_begin_0, end = window_37_end_0, end_mask = window_37_end_mask_0, squeeze_mask = window_37_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = x_21)[name = tensor<string, []>("op_888")];
-            tensor<int32, [3]> var_891_begin_0 = const()[name = tensor<string, []>("op_891_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_891_end_0 = const()[name = tensor<string, []>("op_891_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_891_end_mask_0 = const()[name = tensor<string, []>("op_891_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_891 = slice_by_index(begin = var_891_begin_0, end = var_891_end_0, end_mask = var_891_end_mask_0, x = window_37)[name = tensor<string, []>("op_891")];
+            tensor<int32, [3]> var_954_begin_0 = const()[name = tensor<string, []>("op_954_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_954_end_0 = const()[name = tensor<string, []>("op_954_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_954_end_mask_0 = const()[name = tensor<string, []>("op_954_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_954 = slice_by_index(begin = var_954_begin_0, end = var_954_end_0, end_mask = var_954_end_mask_0, x = x_21)[name = tensor<string, []>("op_954")];
+            tensor<int32, [3]> var_957_begin_0 = const()[name = tensor<string, []>("op_957_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_957_end_0 = const()[name = tensor<string, []>("op_957_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_957_end_mask_0 = const()[name = tensor<string, []>("op_957_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_957 = slice_by_index(begin = var_957_begin_0, end = var_957_end_0, end_mask = var_957_end_mask_0, x = window_37)[name = tensor<string, []>("op_957")];
             tensor<bool, []> window_39_interleave_0 = const()[name = tensor<string, []>("window_39_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_27, interleave = window_39_interleave_0, values = (var_891, var_888))[name = tensor<string, []>("window_39")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = x_21)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> var_899_begin_0 = const()[name = tensor<string, []>("op_899_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_899_end_0 = const()[name = tensor<string, []>("op_899_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_899_end_mask_0 = const()[name = tensor<string, []>("op_899_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_899 = slice_by_index(begin = var_899_begin_0, end = var_899_end_0, end_mask = var_899_end_mask_0, x = window_39)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_93, interleave = window_39_interleave_0, values = (var_957, var_954))[name = tensor<string, []>("window_39")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = x_21)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_965_begin_0 = const()[name = tensor<string, []>("op_965_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_965_end_0 = const()[name = tensor<string, []>("op_965_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_965_end_mask_0 = const()[name = tensor<string, []>("op_965_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_965 = slice_by_index(begin = var_965_begin_0, end = var_965_end_0, end_mask = var_965_end_mask_0, x = window_39)[name = tensor<string, []>("op_965")];
             tensor<bool, []> window_41_interleave_0 = const()[name = tensor<string, []>("window_41_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_27, interleave = window_41_interleave_0, values = (var_899, var_896))[name = tensor<string, []>("window_41")];
-            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = x_21)[name = tensor<string, []>("op_904")];
-            tensor<int32, [3]> var_907_begin_0 = const()[name = tensor<string, []>("op_907_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_907_end_0 = const()[name = tensor<string, []>("op_907_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_907_end_mask_0 = const()[name = tensor<string, []>("op_907_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_907 = slice_by_index(begin = var_907_begin_0, end = var_907_end_0, end_mask = var_907_end_mask_0, x = window_41)[name = tensor<string, []>("op_907")];
+            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_93, interleave = window_41_interleave_0, values = (var_965, var_962))[name = tensor<string, []>("window_41")];
+            tensor<int32, [3]> var_970_begin_0 = const()[name = tensor<string, []>("op_970_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_970_end_0 = const()[name = tensor<string, []>("op_970_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_970_end_mask_0 = const()[name = tensor<string, []>("op_970_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_970 = slice_by_index(begin = var_970_begin_0, end = var_970_end_0, end_mask = var_970_end_mask_0, x = x_21)[name = tensor<string, []>("op_970")];
+            tensor<int32, [3]> var_973_begin_0 = const()[name = tensor<string, []>("op_973_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_973_end_0 = const()[name = tensor<string, []>("op_973_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_973_end_mask_0 = const()[name = tensor<string, []>("op_973_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_973 = slice_by_index(begin = var_973_begin_0, end = var_973_end_0, end_mask = var_973_end_mask_0, x = window_41)[name = tensor<string, []>("op_973")];
             tensor<bool, []> window_43_interleave_0 = const()[name = tensor<string, []>("window_43_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_27, interleave = window_43_interleave_0, values = (var_907, var_904))[name = tensor<string, []>("window_43")];
-            tensor<int32, [3]> var_912_begin_0 = const()[name = tensor<string, []>("op_912_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_912_end_0 = const()[name = tensor<string, []>("op_912_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_912_end_mask_0 = const()[name = tensor<string, []>("op_912_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_912 = slice_by_index(begin = var_912_begin_0, end = var_912_end_0, end_mask = var_912_end_mask_0, x = x_21)[name = tensor<string, []>("op_912")];
-            tensor<int32, [3]> var_915_begin_0 = const()[name = tensor<string, []>("op_915_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_915_end_0 = const()[name = tensor<string, []>("op_915_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_915_end_mask_0 = const()[name = tensor<string, []>("op_915_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_915 = slice_by_index(begin = var_915_begin_0, end = var_915_end_0, end_mask = var_915_end_mask_0, x = window_43)[name = tensor<string, []>("op_915")];
+            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_93, interleave = window_43_interleave_0, values = (var_973, var_970))[name = tensor<string, []>("window_43")];
+            tensor<int32, [3]> var_978_begin_0 = const()[name = tensor<string, []>("op_978_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_978_end_0 = const()[name = tensor<string, []>("op_978_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_978_end_mask_0 = const()[name = tensor<string, []>("op_978_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_978 = slice_by_index(begin = var_978_begin_0, end = var_978_end_0, end_mask = var_978_end_mask_0, x = x_21)[name = tensor<string, []>("op_978")];
+            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = window_43)[name = tensor<string, []>("op_981")];
             tensor<bool, []> window_45_interleave_0 = const()[name = tensor<string, []>("window_45_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_27, interleave = window_45_interleave_0, values = (var_915, var_912))[name = tensor<string, []>("window_45")];
-            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = x_21)[name = tensor<string, []>("op_920")];
-            tensor<int32, [3]> var_923_begin_0 = const()[name = tensor<string, []>("op_923_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_923_end_0 = const()[name = tensor<string, []>("op_923_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_923_end_mask_0 = const()[name = tensor<string, []>("op_923_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_923 = slice_by_index(begin = var_923_begin_0, end = var_923_end_0, end_mask = var_923_end_mask_0, x = window_45)[name = tensor<string, []>("op_923")];
+            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_93, interleave = window_45_interleave_0, values = (var_981, var_978))[name = tensor<string, []>("window_45")];
+            tensor<int32, [3]> var_986_begin_0 = const()[name = tensor<string, []>("op_986_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_986_end_0 = const()[name = tensor<string, []>("op_986_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_986_end_mask_0 = const()[name = tensor<string, []>("op_986_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_986 = slice_by_index(begin = var_986_begin_0, end = var_986_end_0, end_mask = var_986_end_mask_0, x = x_21)[name = tensor<string, []>("op_986")];
+            tensor<int32, [3]> var_989_begin_0 = const()[name = tensor<string, []>("op_989_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_989_end_0 = const()[name = tensor<string, []>("op_989_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_989_end_mask_0 = const()[name = tensor<string, []>("op_989_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_989 = slice_by_index(begin = var_989_begin_0, end = var_989_end_0, end_mask = var_989_end_mask_0, x = window_45)[name = tensor<string, []>("op_989")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_27, interleave = window_interleave_0, values = (var_923, var_920))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_141 = concat(axis = var_24, interleave = input_141_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_93, interleave = window_interleave_0, values = (var_989, var_986))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_143 = concat(axis = var_79, interleave = input_143_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_948_split_sizes_0 = const()[name = tensor<string, []>("op_948_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_948_axis_0 = const()[name = tensor<string, []>("op_948_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_948_0, tensor<fp32, [5, 256, 16]> var_948_1 = split(axis = var_948_axis_0, split_sizes = var_948_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_948")];
-            tensor<fp32, [5, 256, 16]> var_950 = sigmoid(x = var_948_1)[name = tensor<string, []>("op_950")];
-            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_948_0, y = var_950)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [5, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_1014_split_sizes_0 = const()[name = tensor<string, []>("op_1014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_1014_axis_0 = const()[name = tensor<string, []>("op_1014_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_1014_0, tensor<fp32, [5, 256, 16]> var_1014_1 = split(axis = var_1014_axis_0, split_sizes = var_1014_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_1014")];
+            tensor<fp32, [5, 256, 16]> var_1016 = sigmoid(x = var_1014_1)[name = tensor<string, []>("op_1016")];
+            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_1014_0, y = var_1016)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [5, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [5, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [5, 1, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_981")];
-            tensor<int32, [3]> var_983_perm_0 = const()[name = tensor<string, []>("op_983_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_983 = transpose(perm = var_983_perm_0, x = var_981)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 5, 256]> input_151 = add(x = x_21, y = var_983)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 5, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 5, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_1006 = const()[name = tensor<string, []>("op_1006"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_1007 = mul(x = input_159, y = var_1006)[name = tensor<string, []>("op_1007")];
-            tensor<fp32, [1, 5, 256]> input_161 = add(x = var_1007, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1047_begin_0 = const()[name = tensor<string, []>("op_1047_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1047_end_0 = const()[name = tensor<string, []>("op_1047_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_1047_end_mask_0 = const()[name = tensor<string, []>("op_1047_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [5, 1, 256]> var_1047 = slice_by_index(begin = var_1047_begin_0, end = var_1047_end_0, end_mask = var_1047_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1047")];
+            tensor<int32, [3]> var_1049_perm_0 = const()[name = tensor<string, []>("op_1049_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_1049 = transpose(perm = var_1049_perm_0, x = var_1047)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 5, 256]> input_153 = add(x = x_21, y = var_1049)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 5, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 5, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_1073 = mul(x = input_161, y = var_1072)[name = tensor<string, []>("op_1073")];
+            tensor<fp32, [1, 5, 256]> input_163 = add(x = var_1073, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_29, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 5]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_21, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_81, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_1025_begin_0 = const()[name = tensor<string, []>("op_1025_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
-            tensor<int32, [3]> var_1025_end_0 = const()[name = tensor<string, []>("op_1025_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
-            tensor<bool, [3]> var_1025_end_mask_0 = const()[name = tensor<string, []>("op_1025_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = cat)[name = tensor<string, []>("op_1025")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 5, 1]> var_1028 = reduce_l2_norm(axes = var_1027, keep_dims = var_30, x = input_163)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1091_begin_0 = const()[name = tensor<string, []>("op_1091_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
+            tensor<int32, [3]> var_1091_end_0 = const()[name = tensor<string, []>("op_1091_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
+            tensor<bool, [3]> var_1091_end_mask_0 = const()[name = tensor<string, []>("op_1091_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1091_begin_0, end = var_1091_end_0, end_mask = var_1091_end_mask_0, x = cat)[name = tensor<string, []>("op_1091")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 5, 1]> var_1094 = reduce_l2_norm(axes = var_1093, keep_dims = var_75, x = input_165)[name = tensor<string, []>("op_1094")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_1028)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_1032_axis_0 = const()[name = tensor<string, []>("op_1032_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1032_axis_0, values = (var_207, var_429, var_651, nkv_1))[name = tensor<string, []>("op_1032")];
-            tensor<int32, []> var_1034_axis_0 = const()[name = tensor<string, []>("op_1034_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1034_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1034")];
-            tensor<int32, []> var_1036_axis_0 = const()[name = tensor<string, []>("op_1036_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1036_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1036")];
-            tensor<fp32, []> var_1045 = const()[name = tensor<string, []>("op_1045"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1053 = const()[name = tensor<string, []>("op_1053"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_90, beta = const_12, x = var_1094)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1098_axis_0 = const()[name = tensor<string, []>("op_1098_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1098_axis_0, values = (var_273, var_495, var_717, nkv_1))[name = tensor<string, []>("op_1098")];
+            tensor<int32, []> var_1100_axis_0 = const()[name = tensor<string, []>("op_1100_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1100_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1100")];
+            tensor<int32, []> var_1102_axis_0 = const()[name = tensor<string, []>("op_1102_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1102_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1102")];
             tensor<fp32, [1, 5, 9, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 5, 9, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395712)))];
-            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 5, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
+            tensor<int32, [1]> var_1170_axes_0 = const()[name = tensor<string, []>("op_1170_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 5, 1, 256]> var_1170 = expand_dims(axes = var_1170_axes_0, x = emb)[name = tensor<string, []>("op_1170")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 9, 1])];
-            tensor<fp32, [1, 5, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 5, 9, 512]> input_165 = concat(axis = var_1059, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 5, 9, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [1, 9, 5, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [9, 5, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 5, 9, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1170)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 5, 9, 512]> input_167 = concat(axis = var_82, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 5, 9, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1178_perm_0 = const()[name = tensor<string, []>("op_1178_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [1, 9, 5, 256]> var_1178 = transpose(perm = var_1178_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [9, 5, 256]> x_29 = reshape(shape = var_1182, x = var_1178)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1023,132 +1042,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [9, 5, 256]> var_1147 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
+            tensor<fp32, [9, 5, 256]> var_1190 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1192 = reshape(shape = var_1191, x = var_1190)[name = tensor<string, []>("op_1192")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> var_1153 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 5, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
+            tensor<fp32, [9, 5, 256]> var_1196 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 5, 256]> var_1198 = mul(x = var_1196, y = var_1197)[name = tensor<string, []>("op_1198")];
+            tensor<int32, [4]> var_1199 = const()[name = tensor<string, []>("op_1199"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1200 = reshape(shape = var_1199, x = var_1198)[name = tensor<string, []>("op_1200")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> var_1161 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
+            tensor<fp32, [9, 5, 256]> var_1204 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1205 = const()[name = tensor<string, []>("op_1205"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1206 = reshape(shape = var_1205, x = var_1204)[name = tensor<string, []>("op_1206")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [9, 5, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_1065, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_79, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [5]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_1 = clip(alpha = var_1055, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [5]> clip_1 = clip(alpha = var_69, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [5]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [9, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [9, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1200)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [9, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1192)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [9, 4, 5, 5]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 5])];
-            tensor<fp32, [1, 5]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
-            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [9, 4, 5, 5]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
+            tensor<int32, [2]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [2]>([1, 5])];
+            tensor<fp32, [1, 5]> var_1219 = reshape(shape = var_1218, x = valid_mask)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1219)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1221 = const()[name = tensor<string, []>("op_1221"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1222 = reshape(shape = var_1221, x = sqrt_s_t_9)[name = tensor<string, []>("op_1222")];
+            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1222)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [9, 4, 5, 5]> var_1224 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1224")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [9, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 5, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
-            tensor<fp32, [5]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
-            tensor<fp32, [9, 4, 5, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [9, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1206)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [9, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1224, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1226_transpose_x_0 = const()[name = tensor<string, []>("op_1226_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1226_transpose_y_0 = const()[name = tensor<string, []>("op_1226_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 5, 64]> var_1226 = matmul(transpose_x = var_1226_transpose_x_0, transpose_y = var_1226_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1226")];
+            tensor<fp32, [5]> var_1227 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1227")];
+            tensor<int32, [4]> var_1228 = const()[name = tensor<string, []>("op_1228"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1229 = reshape(shape = var_1228, x = var_1227)[name = tensor<string, []>("op_1229")];
+            tensor<fp32, [9, 4, 5, 64]> cross_9 = mul(x = var_1226, y = var_1229)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [9, 4, 5, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
-            tensor<fp32, [9, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
-            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
-            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1233 = reshape(shape = var_1232, x = valid_mask)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [9, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1233)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [9, 4, 64, 64]> var_1235 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1235")];
+            tensor<bool, []> var_1237_transpose_x_1 = const()[name = tensor<string, []>("op_1237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1237_transpose_y_1 = const()[name = tensor<string, []>("op_1237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1237 = matmul(transpose_x = var_1237_transpose_x_1, transpose_y = var_1237_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1237")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1235, y = var_1237)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1239_keep_dims_0 = const()[name = tensor<string, []>("op_1239_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1239 = reduce_sum(keep_dims = var_1239_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [1]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1241)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1055, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_69, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [9, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
-            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [9, 4, 64, 64]> var_1245 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1245")];
+            tensor<int32, [4]> var_1246_perm_0 = const()[name = tensor<string, []>("op_1246_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 5, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [9, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1052, x = var_1203)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [9, 5, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [9, 5, 256]> var_1209 = silu(x = input_169)[name = tensor<string, []>("op_1209")];
-            tensor<fp32, [9, 5, 256]> input_171 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [9, 5, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [9, 5, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 5, 4, 64]> var_1246 = transpose(perm = var_1246_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [9, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_84, x = var_1246)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1250 = const()[name = tensor<string, []>("op_1250"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [9, 5, 256]> out_29 = reshape(shape = var_1250, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [9, 5, 256]> var_1252 = silu(x = input_171)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [9, 5, 256]> input_173 = mul(x = var_1252, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [9, 5, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [9, 5, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1050, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 9, 5, 256])];
-            tensor<fp32, [1, 9, 5, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
-            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([5, 9, 256])];
-            tensor<fp32, [1, 5, 9, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [5, 9, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [9, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_76, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [4]>([1, 9, 5, 256])];
+            tensor<fp32, [1, 9, 5, 256]> var_1263 = reshape(shape = var_1262, x = xt_1)[name = tensor<string, []>("op_1263")];
+            tensor<int32, [4]> var_1264_perm_0 = const()[name = tensor<string, []>("op_1264_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([5, 9, 256])];
+            tensor<fp32, [1, 5, 9, 256]> var_1264 = transpose(perm = var_1264_perm_0, x = var_1263)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [5, 9, 256]> query_1 = reshape(shape = var_1267, x = var_1264)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 5, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [9, 5, 768]> var_1247 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [9, 5, 768]> var_1290 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([9, 5, 3, 256])];
-            tensor<fp32, [9, 5, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
-            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 5, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
-            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 5, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 9, 5, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [9, 5, 3, 256]> var_1292 = reshape(shape = concat_1, x = var_1290)[name = tensor<string, []>("op_1292")];
+            tensor<int32, [1]> var_1293_axes_0 = const()[name = tensor<string, []>("op_1293_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 5, 3, 256]> var_1293 = expand_dims(axes = var_1293_axes_0, x = var_1292)[name = tensor<string, []>("op_1293")];
+            tensor<int32, [5]> var_1294_perm_0 = const()[name = tensor<string, []>("op_1294_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1295_axes_0 = const()[name = tensor<string, []>("op_1295_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 5, 1, 256]> var_1294 = transpose(perm = var_1294_perm_0, x = var_1293)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 9, 5, 256]> var_1295 = squeeze(axes = var_1295_axes_0, x = var_1294)[name = tensor<string, []>("op_1295")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 9, 5, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [9, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 9, 5, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [9, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 9, 5, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
+            tensor<fp32, [9, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1304 = reshape(shape = var_1303, x = q_11)[name = tensor<string, []>("op_1304")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
+            tensor<int32, [3]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1311 = reshape(shape = var_1310, x = k_11)[name = tensor<string, []>("op_1311")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [3]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1318 = reshape(shape = var_1317, x = v_11)[name = tensor<string, []>("op_1318")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [5, 4, 9, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [5, 4, 9, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [5, 4, 9, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1304)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [5, 4, 9, 64]> q_15 = reshape(shape = var_1321, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1311)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [5, 4, 9, 64]> k_15 = reshape(shape = var_1323, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1325 = const()[name = tensor<string, []>("op_1325"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1318)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [5, 4, 9, 64]> v_15 = reshape(shape = var_1325, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 9, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1159,30 +1178,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 9, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([45, 256])];
-            tensor<fp32, [9, 5, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [45, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [45, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [9, 5, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1328 = const()[name = tensor<string, []>("op_1328"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [2]>([45, 256])];
+            tensor<fp32, [9, 5, 4, 64]> var_1329 = transpose(perm = var_1328, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [45, 256]> attn_output_3 = reshape(shape = var_1333, x = var_1329)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [45, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [9, 5, 256]> attn_output_7 = reshape(shape = var_1337, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 9, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [5, 9, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 9, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1050, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [5, 9, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [5, 9, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [5, 9, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [5, 9, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 9, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 9, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_76, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [5, 9, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [5, 9, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 9, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [5, 9, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1050, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 5, 9, 256])];
-            tensor<fp32, [1, 5, 9, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [1, 9, 5, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [9, 5, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
+            tensor<fp32, [5, 9, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_76, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [4]>([1, 5, 9, 256])];
+            tensor<fp32, [1, 5, 9, 256]> x_31 = reshape(shape = var_1357, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1359_perm_0 = const()[name = tensor<string, []>("op_1359_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [1, 9, 5, 256]> var_1359 = transpose(perm = var_1359_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [9, 5, 256]> x = reshape(shape = var_1363, x = var_1359)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 9, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1193,120 +1212,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [9, 5, 256]> var_1328 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
+            tensor<fp32, [9, 5, 256]> var_1371 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1373 = reshape(shape = var_1372, x = var_1371)[name = tensor<string, []>("op_1373")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> var_1334 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [9, 5, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
+            tensor<fp32, [9, 5, 256]> var_1377 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [9, 5, 256]> var_1379 = mul(x = var_1377, y = var_1378)[name = tensor<string, []>("op_1379")];
+            tensor<int32, [4]> var_1380 = const()[name = tensor<string, []>("op_1380"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1381 = reshape(shape = var_1380, x = var_1379)[name = tensor<string, []>("op_1381")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> var_1342 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([9, 5, 4, 64])];
-            tensor<fp32, [9, 5, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
+            tensor<fp32, [9, 5, 256]> var_1385 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1386 = const()[name = tensor<string, []>("op_1386"), val = tensor<int32, [4]>([9, 5, 4, 64])];
+            tensor<fp32, [9, 5, 4, 64]> var_1387 = reshape(shape = var_1386, x = var_1385)[name = tensor<string, []>("op_1387")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [9, 5, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [9, 5, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [5]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_3 = clip(alpha = var_1055, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [5]> clip_3 = clip(alpha = var_69, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [5]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [9, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [9, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [9, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1381)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [9, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1373)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [9, 4, 5, 5]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
-            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
-            tensor<fp32, [9, 4, 5, 5]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [9, 4, 5, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 5, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
-            tensor<fp32, [5]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
-            tensor<fp32, [9, 4, 5, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
-            tensor<fp32, [9, 4, 5, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [9, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [9, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
-            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [9, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
-            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1402 = const()[name = tensor<string, []>("op_1402"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1403 = reshape(shape = var_1402, x = sqrt_s_t)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1403)[name = tensor<string, []>("M")];
+            tensor<fp32, [9, 4, 5, 5]> var_1405 = mul(x = qk, y = M)[name = tensor<string, []>("op_1405")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1387)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [9, 4, 5, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1405, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1407_transpose_x_0 = const()[name = tensor<string, []>("op_1407_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1407_transpose_y_0 = const()[name = tensor<string, []>("op_1407_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 5, 64]> var_1407 = matmul(transpose_x = var_1407_transpose_x_0, transpose_y = var_1407_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1407")];
+            tensor<fp32, [5]> var_1408 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1408")];
+            tensor<int32, [4]> var_1409 = const()[name = tensor<string, []>("op_1409"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1410 = reshape(shape = var_1409, x = var_1408)[name = tensor<string, []>("op_1410")];
+            tensor<fp32, [9, 4, 5, 64]> cross = mul(x = var_1407, y = var_1410)[name = tensor<string, []>("cross")];
+            tensor<fp32, [9, 4, 5, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [9, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1233)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [9, 4, 64, 64]> var_1416 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1416")];
+            tensor<bool, []> var_1418_transpose_x_1 = const()[name = tensor<string, []>("op_1418_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1418_transpose_y_1 = const()[name = tensor<string, []>("op_1418_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [9, 4, 64, 64]> var_1418 = matmul(transpose_x = var_1418_transpose_x_1, transpose_y = var_1418_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1418")];
+            tensor<fp32, [9, 4, 64, 64]> new_kv_unnorm = add(x = var_1416, y = var_1418)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1241)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1055, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_69, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [9, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1427_perm_0 = const()[name = tensor<string, []>("op_1427_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 5, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [9, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1052, x = var_1384)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [9, 5, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [9, 5, 256]> var_1390 = silu(x = input_187)[name = tensor<string, []>("op_1390")];
-            tensor<fp32, [9, 5, 256]> input_189 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [9, 5, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [9, 5, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 5, 4, 64]> var_1427 = transpose(perm = var_1427_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [9, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_84, x = var_1427)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1431 = const()[name = tensor<string, []>("op_1431"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [9, 5, 256]> out = reshape(shape = var_1431, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [9, 5, 256]> var_1433 = silu(x = input_189)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [9, 5, 256]> input_191 = mul(x = var_1433, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [9, 5, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [9, 5, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [9, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1050, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 9, 5, 256])];
-            tensor<fp32, [1, 9, 5, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
-            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([5, 9, 256])];
-            tensor<fp32, [1, 5, 9, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [5, 9, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [9, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_76, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1443 = const()[name = tensor<string, []>("op_1443"), val = tensor<int32, [4]>([1, 9, 5, 256])];
+            tensor<fp32, [1, 9, 5, 256]> var_1444 = reshape(shape = var_1443, x = xt_5)[name = tensor<string, []>("op_1444")];
+            tensor<int32, [4]> var_1445_perm_0 = const()[name = tensor<string, []>("op_1445_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([5, 9, 256])];
+            tensor<fp32, [1, 5, 9, 256]> var_1445 = transpose(perm = var_1445_perm_0, x = var_1444)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [5, 9, 256]> query_5 = reshape(shape = var_1448, x = var_1445)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [9, 5, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [9, 5, 768]> var_1428 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [9, 5, 768]> var_1471 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([9, 5, 3, 256])];
-            tensor<fp32, [9, 5, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
-            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 9, 5, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
-            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 9, 5, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 9, 5, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [9, 5, 3, 256]> var_1473 = reshape(shape = concat_2, x = var_1471)[name = tensor<string, []>("op_1473")];
+            tensor<int32, [1]> var_1474_axes_0 = const()[name = tensor<string, []>("op_1474_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 9, 5, 3, 256]> var_1474 = expand_dims(axes = var_1474_axes_0, x = var_1473)[name = tensor<string, []>("op_1474")];
+            tensor<int32, [5]> var_1475_perm_0 = const()[name = tensor<string, []>("op_1475_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1476_axes_0 = const()[name = tensor<string, []>("op_1476_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 9, 5, 1, 256]> var_1475 = transpose(perm = var_1475_perm_0, x = var_1474)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 9, 5, 256]> var_1476 = squeeze(axes = var_1476_axes_0, x = var_1475)[name = tensor<string, []>("op_1476")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 9, 5, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [9, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 9, 5, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [9, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 9, 5, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [9, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
+            tensor<fp32, [9, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1484 = const()[name = tensor<string, []>("op_1484"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1485 = reshape(shape = var_1484, x = q_19)[name = tensor<string, []>("op_1485")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
+            tensor<int32, [3]> var_1491 = const()[name = tensor<string, []>("op_1491"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1492 = reshape(shape = var_1491, x = k_19)[name = tensor<string, []>("op_1492")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([9, 20, 64])];
-            tensor<fp32, [9, 20, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
+            tensor<int32, [3]> var_1498 = const()[name = tensor<string, []>("op_1498"), val = tensor<int32, [3]>([9, 20, 64])];
+            tensor<fp32, [9, 20, 64]> var_1499 = reshape(shape = var_1498, x = v_19)[name = tensor<string, []>("op_1499")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [5, 4, 9, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [5, 4, 9, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([5, 4, 9, 64])];
-            tensor<fp32, [20, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [5, 4, 9, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1502 = const()[name = tensor<string, []>("op_1502"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1485)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [5, 4, 9, 64]> q = reshape(shape = var_1502, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1504 = const()[name = tensor<string, []>("op_1504"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1492)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [5, 4, 9, 64]> k = reshape(shape = var_1504, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1506 = const()[name = tensor<string, []>("op_1506"), val = tensor<int32, [4]>([5, 4, 9, 64])];
+            tensor<fp32, [20, 9, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1499)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [5, 4, 9, 64]> v = reshape(shape = var_1506, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 9, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1317,36 +1336,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 9, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([45, 256])];
-            tensor<fp32, [9, 5, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [45, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [45, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([9, 5, 256])];
-            tensor<fp32, [9, 5, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1509 = const()[name = tensor<string, []>("op_1509"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1514 = const()[name = tensor<string, []>("op_1514"), val = tensor<int32, [2]>([45, 256])];
+            tensor<fp32, [9, 5, 4, 64]> var_1510 = transpose(perm = var_1509, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [45, 256]> attn_output_11 = reshape(shape = var_1514, x = var_1510)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [45, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1518 = const()[name = tensor<string, []>("op_1518"), val = tensor<int32, [3]>([9, 5, 256])];
+            tensor<fp32, [9, 5, 256]> attn_output = reshape(shape = var_1518, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 9, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [5, 9, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 9, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1050, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [5, 9, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [5, 9, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [5, 9, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [5, 9, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 9, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 9, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_76, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [5, 9, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [5, 9, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 9, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [5, 9, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1050, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 5, 9, 256])];
-            tensor<fp32, [1, 5, 9, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 9, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_1053, x = input)[name = tensor<string, []>("op_1498")];
+            tensor<fp32, [5, 9, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_76, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1538 = const()[name = tensor<string, []>("op_1538"), val = tensor<int32, [4]>([1, 5, 9, 256])];
+            tensor<fp32, [1, 5, 9, 256]> input = reshape(shape = var_1538, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1540 = const()[name = tensor<string, []>("op_1540"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 9, 1]> var_1541 = reduce_l2_norm(axes = var_1540, keep_dims = var_75, x = input)[name = tensor<string, []>("op_1541")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 9, 1]> clip_5 = clip(alpha = var_1045, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 5, 9, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
+            tensor<fp32, [1, 5, 9, 1]> clip_5 = clip(alpha = var_90, beta = const_42, x = var_1541)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 5, 9, 256]> var_1543 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1543")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([5, 1, 256])];
             tensor<fp32, [5, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([5, 256, 9])];
-            tensor<fp32, [1, 5, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 5, 256, 9]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1543)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [5, 256, 9]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1357,10 +1376,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 5, 8])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 5, 7]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 5, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
-            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
-            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
+            tensor<fp32, [1, 5, 7]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1547")];
+            tensor<int32, []> var_1549_axis_0 = const()[name = tensor<string, []>("op_1549_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 9, 4, 64, 64]> dec_kv_new = stack(axis = var_1549_axis_0, values = (var_1245, nkv))[name = tensor<string, []>("op_1549")];
+            tensor<int32, []> var_1551_axis_0 = const()[name = tensor<string, []>("op_1551_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1551_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1551")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index e15cb45b1297e6fc6556ae5ba31a2b881b5eb03b..1b93c691eb7ab0a9b93c9f62a746822f7ce0e698 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32b8d373605b82b210571b934838ef9e0ede35dc97c852d616acea69d2d1c22c
-size 196620
+oid sha256:9faeb4f3c5526ac70955a9cbc1da548053b2fdeeda49edad279bca5b0543d453
+size 203220
diff --git a/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Manifest.json b/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Manifest.json
index 7b1a12bb9811d9e20f8485718da28926f0fe3cd1..880c068fe5e42be2c295514041d528d410ce8114 100644
--- a/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Manifest.json
+++ b/optimized/ch/500ms/ls_eend_ch_500ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "B3E5EFE2-8BD2-41C3-B202-D931C2CCA18E": {
+        "22DDF204-0B49-4CB0-A43C-85A28EC40D19": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "E2319A52-06A1-4D52-8B9C-D3854ED1D555": {
+        "8D5435E8-36A0-406C-8F2A-A64DE6AB7ADA": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "E2319A52-06A1-4D52-8B9C-D3854ED1D555"
+    "rootModelIdentifier": "8D5435E8-36A0-406C-8F2A-A64DE6AB7ADA"
 }
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/analytics/coremldata.bin b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/analytics/coremldata.bin
index e6a49061db71bd6609e70b767cc7b7d63298f84f..08b08077481b93da48cc59029ff2a55c5e492230 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a4b9f84d7ea9b63709ef0b529a4154d7c460335b9b0ff8de63b48ab92f0e2e2c
+oid sha256:bbc1211c80f5d2d6a66d9a9a8588f469eb038399ffa9625b2bd5e7675d88e026
 size 243
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/coremldata.bin b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/coremldata.bin
index 2f3d2e94407edd8a732a883b6a075a76fb034d65..187a1081ee38d9af1585849e48ffcc03dce7a55e 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/coremldata.bin
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b761b596de7a3abcc362d422df9b4f2626ec351b373ef5652bc478cea8e6568
-size 1308
+oid sha256:c08015c27fd3eb10b72b9dcf0cc5426bbc510677b6c471695746401352d58f94
+size 1411
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/metadata.json b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/metadata.json
index 36071321ca497cfeb4a01fcebd77a489dc4f5a14..1f8297ee0f85446fd7dff2b19281eaa7f36166e9 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/metadata.json
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=1, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=1, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,12 +81,12 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 66,
+      "Ios17.reshape" : 67,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
       "Split" : 4,
-      "Ios17.expandDims" : 3,
+      "Ios17.expandDims" : 4,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
       "Ios17.sliceByIndex" : 36,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 1 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 15 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 1, 345]",
+        "shape" : "[1, 15, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 15}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/model.mil b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/model.mil
index 182f88767b6a7156fba8d5e17cc71543704b1b0a..4bc2f36465c8a6b77c6ec5021d7ddf40dec1643b 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/model.mil
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlmodelc/model.mil
@@ -1,233 +1,239 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 1, 345]> features, tensor<fp32, [1]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [1, 1]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
-            tensor<fp32, [1]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 1, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 15, 23]> features, tensor<fp32, [1]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [1, 1]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
+            tensor<fp32, [1]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [1]> stacked_axes_0 = const()[name = tensor<string, []>("stacked_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1, 1, 15, 23]> stacked = expand_dims(axes = stacked_axes_0, x = features)[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, [3]>([1, 1, 345])];
+            tensor<fp32, [1, 1, 345]> input_1 = reshape(shape = var_26, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_42 = const()[name = tensor<string, []>("op_42"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 1, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 1, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 1, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 1, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_36, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 1, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 1, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_173 = const()[name = tensor<string, []>("op_173"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_174 = mul(x = input_13, y = var_173)[name = tensor<string, []>("op_174")];
+            tensor<fp32, [1, 1, 256]> input_15 = add(x = var_174, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -238,139 +244,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 1, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 1, 256]> var_188 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_190 = reshape(shape = var_189, x = var_188)[name = tensor<string, []>("op_190")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 1, 256]> var_194 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_196 = mul(x = var_194, y = var_195)[name = tensor<string, []>("op_196")];
+            tensor<int32, [4]> var_197 = const()[name = tensor<string, []>("op_197"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_198 = reshape(shape = var_197, x = var_196)[name = tensor<string, []>("op_198")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 1, 256]> var_202 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_204 = reshape(shape = var_203, x = var_202)[name = tensor<string, []>("op_204")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 1, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [1]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_198)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_190)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 1, 1]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [1, 1]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 1, 1]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_215 = reshape(shape = var_214, x = sqrt_s_t_1)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 1]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_215)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 1, 1]> var_217 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_217")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [1]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_204)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_217, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_219_transpose_x_0 = const()[name = tensor<string, []>("op_219_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_219_transpose_y_0 = const()[name = tensor<string, []>("op_219_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_219 = matmul(transpose_x = var_219_transpose_x_0, transpose_y = var_219_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_219")];
+            tensor<fp32, [1]> var_220 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_220")];
+            tensor<int32, [4]> var_221 = const()[name = tensor<string, []>("op_221"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_222 = reshape(shape = var_221, x = var_220)[name = tensor<string, []>("op_222")];
+            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_219, y = var_222)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 1, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_225 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_225")];
+            tensor<bool, []> var_227_transpose_x_1 = const()[name = tensor<string, []>("op_227_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_227_transpose_y_1 = const()[name = tensor<string, []>("op_227_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_227 = matmul(transpose_x = var_227_transpose_x_1, transpose_y = var_227_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_227")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_225, y = var_227)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_229)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_231 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_231")];
+            tensor<fp32, [1, 4, 64, 64]> var_232 = real_div(x = new_kv_unnorm_1, y = var_231)[name = tensor<string, []>("op_232")];
+            tensor<int32, [4]> var_233_perm_0 = const()[name = tensor<string, []>("op_233_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 1, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 1, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 1, 4, 64]> var_233 = transpose(perm = var_233_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_44, x = var_233)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_237, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 1, 256]> var_239 = silu(x = input_19)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [1, 1, 256]> input_21 = mul(x = var_239, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_250_begin_0 = const()[name = tensor<string, []>("op_250_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_250_end_0 = const()[name = tensor<string, []>("op_250_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_250_end_mask_0 = const()[name = tensor<string, []>("op_250_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_250 = slice_by_index(begin = var_250_begin_0, end = var_250_end_0, end_mask = var_250_end_mask_0, x = window_1)[name = tensor<string, []>("op_250")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, x_3))[name = tensor<string, []>("window_3")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = window_3)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_52, interleave = window_3_interleave_0, values = (var_250, x_3))[name = tensor<string, []>("window_3")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_23 = concat(axis = var_39, interleave = input_23_interleave_0, values = window_3)[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_249_split_sizes_0 = const()[name = tensor<string, []>("op_249_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_249_axis_0 = const()[name = tensor<string, []>("op_249_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_249_0, tensor<fp32, [1, 256, 16]> var_249_1 = split(axis = var_249_axis_0, split_sizes = var_249_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_249")];
-            tensor<fp32, [1, 256, 16]> var_251 = sigmoid(x = var_249_1)[name = tensor<string, []>("op_251")];
-            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_249_0, y = var_251)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [1, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_275_split_sizes_0 = const()[name = tensor<string, []>("op_275_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_275_axis_0 = const()[name = tensor<string, []>("op_275_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_275_0, tensor<fp32, [1, 256, 16]> var_275_1 = split(axis = var_275_axis_0, split_sizes = var_275_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_275")];
+            tensor<fp32, [1, 256, 16]> var_277 = sigmoid(x = var_275_1)[name = tensor<string, []>("op_277")];
+            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_275_0, y = var_277)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [1, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [1, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_282_begin_0 = const()[name = tensor<string, []>("op_282_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_282_end_0 = const()[name = tensor<string, []>("op_282_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_282_end_mask_0 = const()[name = tensor<string, []>("op_282_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [1, 1, 256]> var_282 = slice_by_index(begin = var_282_begin_0, end = var_282_end_0, end_mask = var_282_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_282")];
-            tensor<int32, [3]> var_284_perm_0 = const()[name = tensor<string, []>("op_284_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_284 = transpose(perm = var_284_perm_0, x = var_282)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 1, 256]> input_31 = add(x = x_3, y = var_284)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 1, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 1, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_307 = const()[name = tensor<string, []>("op_307"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_308 = mul(x = input_39, y = var_307)[name = tensor<string, []>("op_308")];
-            tensor<fp32, [1, 1, 256]> input_41 = add(x = var_308, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_308_begin_0 = const()[name = tensor<string, []>("op_308_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_308_end_0 = const()[name = tensor<string, []>("op_308_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_308_end_mask_0 = const()[name = tensor<string, []>("op_308_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [1, 1, 256]> var_308 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_308")];
+            tensor<int32, [3]> var_310_perm_0 = const()[name = tensor<string, []>("op_310_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_310 = transpose(perm = var_310_perm_0, x = var_308)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 1, 256]> input_33 = add(x = x_3, y = var_310)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 1, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 1, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_333 = const()[name = tensor<string, []>("op_333"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_334 = mul(x = input_41, y = var_333)[name = tensor<string, []>("op_334")];
+            tensor<fp32, [1, 1, 256]> input_43 = add(x = var_334, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 1, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 1, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_337 = const()[name = tensor<string, []>("op_337"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_338 = mul(x = input_51, y = var_337)[name = tensor<string, []>("op_338")];
-            tensor<fp32, [1, 1, 256]> input_53 = add(x = var_338, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 1, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 1, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_363 = const()[name = tensor<string, []>("op_363"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_364 = mul(x = input_53, y = var_363)[name = tensor<string, []>("op_364")];
+            tensor<fp32, [1, 1, 256]> input_55 = add(x = var_364, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -381,139 +387,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 1, 256]> var_352 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_354 = reshape(shape = var_353, x = var_352)[name = tensor<string, []>("op_354")];
+            tensor<fp32, [1, 1, 256]> var_378 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_379 = const()[name = tensor<string, []>("op_379"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_380 = reshape(shape = var_379, x = var_378)[name = tensor<string, []>("op_380")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_358 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_359 = const()[name = tensor<string, []>("op_359"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_360 = mul(x = var_358, y = var_359)[name = tensor<string, []>("op_360")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 1, 256]> var_384 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_386 = mul(x = var_384, y = var_385)[name = tensor<string, []>("op_386")];
+            tensor<int32, [4]> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_388 = reshape(shape = var_387, x = var_386)[name = tensor<string, []>("op_388")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_366 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_368 = reshape(shape = var_367, x = var_366)[name = tensor<string, []>("op_368")];
+            tensor<fp32, [1, 1, 256]> var_392 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 1, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [1]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_354)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_388)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_380)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 1, 1]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_378 = const()[name = tensor<string, []>("op_378"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_379 = reshape(shape = var_378, x = sqrt_s_t_3)[name = tensor<string, []>("op_379")];
-            tensor<fp32, [1, 1]> M_3 = real_div(x = encoder__causal_mask, y = var_379)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 1, 1]> var_381 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_381")];
+            tensor<int32, [2]> var_404 = const()[name = tensor<string, []>("op_404"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_405 = reshape(shape = var_404, x = sqrt_s_t_3)[name = tensor<string, []>("op_405")];
+            tensor<fp32, [1, 1]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_405)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 1, 1]> var_407 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_407")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_368)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_381, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_383_transpose_x_0 = const()[name = tensor<string, []>("op_383_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_383_transpose_y_0 = const()[name = tensor<string, []>("op_383_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_383 = matmul(transpose_x = var_383_transpose_x_0, transpose_y = var_383_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_383")];
-            tensor<fp32, [1]> var_384 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
-            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_383, y = var_386)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_394)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_407, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_409_transpose_x_0 = const()[name = tensor<string, []>("op_409_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_409_transpose_y_0 = const()[name = tensor<string, []>("op_409_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_409 = matmul(transpose_x = var_409_transpose_x_0, transpose_y = var_409_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_409")];
+            tensor<fp32, [1]> var_410 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_410")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
+            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_409, y = var_412)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 1, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_389 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_389")];
-            tensor<bool, []> var_391_transpose_x_1 = const()[name = tensor<string, []>("op_391_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_391_transpose_y_1 = const()[name = tensor<string, []>("op_391_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_1, transpose_y = var_391_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_389, y = var_391)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_393)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_395 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [1, 4, 64, 64]> var_396 = real_div(x = new_kv_unnorm_3, y = var_395)[name = tensor<string, []>("op_396")];
-            tensor<int32, [4]> var_397_perm_0 = const()[name = tensor<string, []>("op_397_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_415 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_415")];
+            tensor<bool, []> var_417_transpose_x_1 = const()[name = tensor<string, []>("op_417_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_417_transpose_y_1 = const()[name = tensor<string, []>("op_417_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_417 = matmul(transpose_x = var_417_transpose_x_1, transpose_y = var_417_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_417")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_415, y = var_417)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_419 = const()[name = tensor<string, []>("op_419"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_419)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_421 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_421")];
+            tensor<fp32, [1, 4, 64, 64]> var_422 = real_div(x = new_kv_unnorm_3, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423_perm_0 = const()[name = tensor<string, []>("op_423_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_397 = transpose(perm = var_397_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_397)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_401, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 1, 256]> var_403 = silu(x = input_57)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 1, 256]> input_59 = mul(x = var_403, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 1, 4, 64]> var_423 = transpose(perm = var_423_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_44, x = var_423)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_427 = const()[name = tensor<string, []>("op_427"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_427, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 1, 256]> var_429 = silu(x = input_59)[name = tensor<string, []>("op_429")];
+            tensor<fp32, [1, 1, 256]> input_61 = mul(x = var_429, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_5_begin_0 = const()[name = tensor<string, []>("window_5_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_5_end_0 = const()[name = tensor<string, []>("window_5_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_5_end_mask_0 = const()[name = tensor<string, []>("window_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_5_squeeze_mask_0 = const()[name = tensor<string, []>("window_5_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_5 = slice_by_index(begin = window_5_begin_0, end = window_5_end_0, end_mask = window_5_end_mask_0, squeeze_mask = window_5_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_414_begin_0 = const()[name = tensor<string, []>("op_414_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_414_end_0 = const()[name = tensor<string, []>("op_414_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_414_end_mask_0 = const()[name = tensor<string, []>("op_414_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_414 = slice_by_index(begin = var_414_begin_0, end = var_414_end_0, end_mask = var_414_end_mask_0, x = window_5)[name = tensor<string, []>("op_414")];
+            tensor<int32, [3]> var_440_begin_0 = const()[name = tensor<string, []>("op_440_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_440_end_0 = const()[name = tensor<string, []>("op_440_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_440_end_mask_0 = const()[name = tensor<string, []>("op_440_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_440 = slice_by_index(begin = var_440_begin_0, end = var_440_end_0, end_mask = var_440_end_mask_0, x = window_5)[name = tensor<string, []>("op_440")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_414, x_9))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = window_7)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_52, interleave = window_7_interleave_0, values = (var_440, x_9))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_63 = concat(axis = var_39, interleave = input_63_interleave_0, values = window_7)[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_439_split_sizes_0 = const()[name = tensor<string, []>("op_439_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_439_axis_0 = const()[name = tensor<string, []>("op_439_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_439_0, tensor<fp32, [1, 256, 16]> var_439_1 = split(axis = var_439_axis_0, split_sizes = var_439_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_439")];
-            tensor<fp32, [1, 256, 16]> var_441 = sigmoid(x = var_439_1)[name = tensor<string, []>("op_441")];
-            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_439_0, y = var_441)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [1, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_465_split_sizes_0 = const()[name = tensor<string, []>("op_465_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_465_axis_0 = const()[name = tensor<string, []>("op_465_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_465_0, tensor<fp32, [1, 256, 16]> var_465_1 = split(axis = var_465_axis_0, split_sizes = var_465_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 256, 16]> var_467 = sigmoid(x = var_465_1)[name = tensor<string, []>("op_467")];
+            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_465_0, y = var_467)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [1, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [1, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_472_begin_0 = const()[name = tensor<string, []>("op_472_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_472_end_0 = const()[name = tensor<string, []>("op_472_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_472_end_mask_0 = const()[name = tensor<string, []>("op_472_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [1, 1, 256]> var_472 = slice_by_index(begin = var_472_begin_0, end = var_472_end_0, end_mask = var_472_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_472")];
-            tensor<int32, [3]> var_474_perm_0 = const()[name = tensor<string, []>("op_474_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_474 = transpose(perm = var_474_perm_0, x = var_472)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 1, 256]> input_71 = add(x = x_9, y = var_474)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 1, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 1, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_497 = const()[name = tensor<string, []>("op_497"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_498 = mul(x = input_79, y = var_497)[name = tensor<string, []>("op_498")];
-            tensor<fp32, [1, 1, 256]> input_81 = add(x = var_498, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_498_begin_0 = const()[name = tensor<string, []>("op_498_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_498_end_0 = const()[name = tensor<string, []>("op_498_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_498_end_mask_0 = const()[name = tensor<string, []>("op_498_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [1, 1, 256]> var_498 = slice_by_index(begin = var_498_begin_0, end = var_498_end_0, end_mask = var_498_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_498")];
+            tensor<int32, [3]> var_500_perm_0 = const()[name = tensor<string, []>("op_500_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_500 = transpose(perm = var_500_perm_0, x = var_498)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 1, 256]> input_73 = add(x = x_9, y = var_500)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 1, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 1, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_523 = const()[name = tensor<string, []>("op_523"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_524 = mul(x = input_81, y = var_523)[name = tensor<string, []>("op_524")];
+            tensor<fp32, [1, 1, 256]> input_83 = add(x = var_524, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 1, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 1, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_527 = const()[name = tensor<string, []>("op_527"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_528 = mul(x = input_91, y = var_527)[name = tensor<string, []>("op_528")];
-            tensor<fp32, [1, 1, 256]> input_93 = add(x = var_528, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 1, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 1, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_553 = const()[name = tensor<string, []>("op_553"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_554 = mul(x = input_93, y = var_553)[name = tensor<string, []>("op_554")];
+            tensor<fp32, [1, 1, 256]> input_95 = add(x = var_554, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -524,139 +530,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 1, 256]> var_542 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_544 = reshape(shape = var_543, x = var_542)[name = tensor<string, []>("op_544")];
+            tensor<fp32, [1, 1, 256]> var_568 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_569 = const()[name = tensor<string, []>("op_569"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_570 = reshape(shape = var_569, x = var_568)[name = tensor<string, []>("op_570")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_548 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_550 = mul(x = var_548, y = var_549)[name = tensor<string, []>("op_550")];
-            tensor<int32, [4]> var_551 = const()[name = tensor<string, []>("op_551"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_552 = reshape(shape = var_551, x = var_550)[name = tensor<string, []>("op_552")];
+            tensor<fp32, [1, 1, 256]> var_574 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_576 = mul(x = var_574, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<int32, [4]> var_577 = const()[name = tensor<string, []>("op_577"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_578 = reshape(shape = var_577, x = var_576)[name = tensor<string, []>("op_578")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_556 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_557 = const()[name = tensor<string, []>("op_557"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_558 = reshape(shape = var_557, x = var_556)[name = tensor<string, []>("op_558")];
+            tensor<fp32, [1, 1, 256]> var_582 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 1, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [1]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_552)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_544)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_578)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_570)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 1, 1]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_568 = const()[name = tensor<string, []>("op_568"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_569 = reshape(shape = var_568, x = sqrt_s_t_5)[name = tensor<string, []>("op_569")];
-            tensor<fp32, [1, 1]> M_5 = real_div(x = encoder__causal_mask, y = var_569)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 1, 1]> var_571 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_571")];
+            tensor<int32, [2]> var_594 = const()[name = tensor<string, []>("op_594"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_595 = reshape(shape = var_594, x = sqrt_s_t_5)[name = tensor<string, []>("op_595")];
+            tensor<fp32, [1, 1]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_595)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 1, 1]> var_597 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_597")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_558)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_571, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_573_transpose_x_0 = const()[name = tensor<string, []>("op_573_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_573_transpose_y_0 = const()[name = tensor<string, []>("op_573_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_573 = matmul(transpose_x = var_573_transpose_x_0, transpose_y = var_573_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_573")];
-            tensor<fp32, [1]> var_574 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_574")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_573, y = var_576)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_597, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_599_transpose_x_0 = const()[name = tensor<string, []>("op_599_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_599_transpose_y_0 = const()[name = tensor<string, []>("op_599_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_599 = matmul(transpose_x = var_599_transpose_x_0, transpose_y = var_599_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_599")];
+            tensor<fp32, [1]> var_600 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_600")];
+            tensor<int32, [4]> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_602 = reshape(shape = var_601, x = var_600)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_599, y = var_602)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 1, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_579 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_579")];
-            tensor<bool, []> var_581_transpose_x_1 = const()[name = tensor<string, []>("op_581_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_581_transpose_y_1 = const()[name = tensor<string, []>("op_581_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_581 = matmul(transpose_x = var_581_transpose_x_1, transpose_y = var_581_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_581")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_579, y = var_581)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_583)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_585 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [1, 4, 64, 64]> var_586 = real_div(x = new_kv_unnorm_5, y = var_585)[name = tensor<string, []>("op_586")];
-            tensor<int32, [4]> var_587_perm_0 = const()[name = tensor<string, []>("op_587_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_605 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_605")];
+            tensor<bool, []> var_607_transpose_x_1 = const()[name = tensor<string, []>("op_607_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_607_transpose_y_1 = const()[name = tensor<string, []>("op_607_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_607 = matmul(transpose_x = var_607_transpose_x_1, transpose_y = var_607_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_607")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_605, y = var_607)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_609)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_611 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_611")];
+            tensor<fp32, [1, 4, 64, 64]> var_612 = real_div(x = new_kv_unnorm_5, y = var_611)[name = tensor<string, []>("op_612")];
+            tensor<int32, [4]> var_613_perm_0 = const()[name = tensor<string, []>("op_613_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_587 = transpose(perm = var_587_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_587)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_591, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 1, 256]> var_593 = silu(x = input_97)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 1, 256]> input_99 = mul(x = var_593, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 1, 4, 64]> var_613 = transpose(perm = var_613_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_44, x = var_613)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_617 = const()[name = tensor<string, []>("op_617"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_617, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 1, 256]> var_619 = silu(x = input_99)[name = tensor<string, []>("op_619")];
+            tensor<fp32, [1, 1, 256]> input_101 = mul(x = var_619, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_604_begin_0 = const()[name = tensor<string, []>("op_604_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_604_end_0 = const()[name = tensor<string, []>("op_604_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_604_end_mask_0 = const()[name = tensor<string, []>("op_604_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_604 = slice_by_index(begin = var_604_begin_0, end = var_604_end_0, end_mask = var_604_end_mask_0, x = window_9)[name = tensor<string, []>("op_604")];
+            tensor<int32, [3]> var_630_begin_0 = const()[name = tensor<string, []>("op_630_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_630_end_0 = const()[name = tensor<string, []>("op_630_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_630_end_mask_0 = const()[name = tensor<string, []>("op_630_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_630 = slice_by_index(begin = var_630_begin_0, end = var_630_end_0, end_mask = var_630_end_mask_0, x = window_9)[name = tensor<string, []>("op_630")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_604, x_15))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = window_11)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_52, interleave = window_11_interleave_0, values = (var_630, x_15))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_103 = concat(axis = var_39, interleave = input_103_interleave_0, values = window_11)[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_629_split_sizes_0 = const()[name = tensor<string, []>("op_629_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_629_axis_0 = const()[name = tensor<string, []>("op_629_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_629_0, tensor<fp32, [1, 256, 16]> var_629_1 = split(axis = var_629_axis_0, split_sizes = var_629_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 256, 16]> var_631 = sigmoid(x = var_629_1)[name = tensor<string, []>("op_631")];
-            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_629_0, y = var_631)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [1, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_655_split_sizes_0 = const()[name = tensor<string, []>("op_655_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_655_axis_0 = const()[name = tensor<string, []>("op_655_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_655_0, tensor<fp32, [1, 256, 16]> var_655_1 = split(axis = var_655_axis_0, split_sizes = var_655_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_655")];
+            tensor<fp32, [1, 256, 16]> var_657 = sigmoid(x = var_655_1)[name = tensor<string, []>("op_657")];
+            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_655_0, y = var_657)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [1, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [1, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_662_begin_0 = const()[name = tensor<string, []>("op_662_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_662_end_0 = const()[name = tensor<string, []>("op_662_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_662_end_mask_0 = const()[name = tensor<string, []>("op_662_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [1, 1, 256]> var_662 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_662")];
-            tensor<int32, [3]> var_664_perm_0 = const()[name = tensor<string, []>("op_664_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_664 = transpose(perm = var_664_perm_0, x = var_662)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 1, 256]> input_111 = add(x = x_15, y = var_664)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 1, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 1, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_688 = mul(x = input_119, y = var_687)[name = tensor<string, []>("op_688")];
-            tensor<fp32, [1, 1, 256]> input_121 = add(x = var_688, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_688_begin_0 = const()[name = tensor<string, []>("op_688_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_688_end_0 = const()[name = tensor<string, []>("op_688_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_688_end_mask_0 = const()[name = tensor<string, []>("op_688_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [1, 1, 256]> var_688 = slice_by_index(begin = var_688_begin_0, end = var_688_end_0, end_mask = var_688_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_688")];
+            tensor<int32, [3]> var_690_perm_0 = const()[name = tensor<string, []>("op_690_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_690 = transpose(perm = var_690_perm_0, x = var_688)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 1, 256]> input_113 = add(x = x_15, y = var_690)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 1, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 1, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_713 = const()[name = tensor<string, []>("op_713"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_714 = mul(x = input_121, y = var_713)[name = tensor<string, []>("op_714")];
+            tensor<fp32, [1, 1, 256]> input_123 = add(x = var_714, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 1, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 1, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_717 = const()[name = tensor<string, []>("op_717"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_718 = mul(x = input_131, y = var_717)[name = tensor<string, []>("op_718")];
-            tensor<fp32, [1, 1, 256]> input_133 = add(x = var_718, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 1, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 1, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_743 = const()[name = tensor<string, []>("op_743"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_744 = mul(x = input_133, y = var_743)[name = tensor<string, []>("op_744")];
+            tensor<fp32, [1, 1, 256]> input_135 = add(x = var_744, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -667,175 +673,168 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 1, 256]> var_732 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_733 = const()[name = tensor<string, []>("op_733"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_734 = reshape(shape = var_733, x = var_732)[name = tensor<string, []>("op_734")];
+            tensor<fp32, [1, 1, 256]> var_758 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_760 = reshape(shape = var_759, x = var_758)[name = tensor<string, []>("op_760")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_738 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_739 = const()[name = tensor<string, []>("op_739"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_740 = mul(x = var_738, y = var_739)[name = tensor<string, []>("op_740")];
-            tensor<int32, [4]> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_742 = reshape(shape = var_741, x = var_740)[name = tensor<string, []>("op_742")];
+            tensor<fp32, [1, 1, 256]> var_764 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_766 = mul(x = var_764, y = var_765)[name = tensor<string, []>("op_766")];
+            tensor<int32, [4]> var_767 = const()[name = tensor<string, []>("op_767"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_768 = reshape(shape = var_767, x = var_766)[name = tensor<string, []>("op_768")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_746 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_748 = reshape(shape = var_747, x = var_746)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 1, 256]> var_772 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_774 = reshape(shape = var_773, x = var_772)[name = tensor<string, []>("op_774")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 1, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [1]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_742)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_734)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_768)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_760)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 1, 1]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_758 = const()[name = tensor<string, []>("op_758"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_759 = reshape(shape = var_758, x = sqrt_s_t_7)[name = tensor<string, []>("op_759")];
-            tensor<fp32, [1, 1]> M_7 = real_div(x = encoder__causal_mask, y = var_759)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 1, 1]> var_761 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_761")];
+            tensor<int32, [2]> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_785 = reshape(shape = var_784, x = sqrt_s_t_7)[name = tensor<string, []>("op_785")];
+            tensor<fp32, [1, 1]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_785)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 1, 1]> var_787 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_787")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_748)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_761, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_763_transpose_x_0 = const()[name = tensor<string, []>("op_763_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_763_transpose_y_0 = const()[name = tensor<string, []>("op_763_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_763 = matmul(transpose_x = var_763_transpose_x_0, transpose_y = var_763_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_763")];
-            tensor<fp32, [1]> var_764 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_763, y = var_766)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_774)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_787, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_789_transpose_x_0 = const()[name = tensor<string, []>("op_789_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_789_transpose_y_0 = const()[name = tensor<string, []>("op_789_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_789 = matmul(transpose_x = var_789_transpose_x_0, transpose_y = var_789_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_789")];
+            tensor<fp32, [1]> var_790 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_790")];
+            tensor<int32, [4]> var_791 = const()[name = tensor<string, []>("op_791"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_792 = reshape(shape = var_791, x = var_790)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_789, y = var_792)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 1, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_769 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_769")];
-            tensor<bool, []> var_771_transpose_x_1 = const()[name = tensor<string, []>("op_771_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_771_transpose_y_1 = const()[name = tensor<string, []>("op_771_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_771 = matmul(transpose_x = var_771_transpose_x_1, transpose_y = var_771_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_771")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_769, y = var_771)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_773)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_775 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_775")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_775)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_777_perm_0 = const()[name = tensor<string, []>("op_777_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_795 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_795")];
+            tensor<bool, []> var_797_transpose_x_1 = const()[name = tensor<string, []>("op_797_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_797_transpose_y_1 = const()[name = tensor<string, []>("op_797_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_797 = matmul(transpose_x = var_797_transpose_x_1, transpose_y = var_797_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_797")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_795, y = var_797)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_799)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_801 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_801")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_801)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_803_perm_0 = const()[name = tensor<string, []>("op_803_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_777 = transpose(perm = var_777_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_777)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_781, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 1, 256]> var_783 = silu(x = input_137)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [1, 1, 256]> input_139 = mul(x = var_783, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 1, 4, 64]> var_803 = transpose(perm = var_803_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_44, x = var_803)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_807, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 1, 256]> var_809 = silu(x = input_139)[name = tensor<string, []>("op_809")];
+            tensor<fp32, [1, 1, 256]> input_141 = mul(x = var_809, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_794_begin_0 = const()[name = tensor<string, []>("op_794_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_794_end_0 = const()[name = tensor<string, []>("op_794_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_794_end_mask_0 = const()[name = tensor<string, []>("op_794_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_794 = slice_by_index(begin = var_794_begin_0, end = var_794_end_0, end_mask = var_794_end_mask_0, x = window_13)[name = tensor<string, []>("op_794")];
+            tensor<int32, [3]> var_820_begin_0 = const()[name = tensor<string, []>("op_820_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_820_end_0 = const()[name = tensor<string, []>("op_820_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_820_end_mask_0 = const()[name = tensor<string, []>("op_820_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_820 = slice_by_index(begin = var_820_begin_0, end = var_820_end_0, end_mask = var_820_end_mask_0, x = window_13)[name = tensor<string, []>("op_820")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_794, x_21))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = window)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_52, interleave = window_interleave_0, values = (var_820, x_21))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_143 = concat(axis = var_39, interleave = input_143_interleave_0, values = window)[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_819_split_sizes_0 = const()[name = tensor<string, []>("op_819_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_819_axis_0 = const()[name = tensor<string, []>("op_819_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_819_0, tensor<fp32, [1, 256, 16]> var_819_1 = split(axis = var_819_axis_0, split_sizes = var_819_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 256, 16]> var_821 = sigmoid(x = var_819_1)[name = tensor<string, []>("op_821")];
-            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_819_0, y = var_821)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [1, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_845_split_sizes_0 = const()[name = tensor<string, []>("op_845_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_845_axis_0 = const()[name = tensor<string, []>("op_845_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_845_0, tensor<fp32, [1, 256, 16]> var_845_1 = split(axis = var_845_axis_0, split_sizes = var_845_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 256, 16]> var_847 = sigmoid(x = var_845_1)[name = tensor<string, []>("op_847")];
+            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_845_0, y = var_847)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [1, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [1, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_852_begin_0 = const()[name = tensor<string, []>("op_852_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_852_end_0 = const()[name = tensor<string, []>("op_852_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_852_end_mask_0 = const()[name = tensor<string, []>("op_852_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [1, 1, 256]> var_852 = slice_by_index(begin = var_852_begin_0, end = var_852_end_0, end_mask = var_852_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_852")];
-            tensor<int32, [3]> var_854_perm_0 = const()[name = tensor<string, []>("op_854_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_854 = transpose(perm = var_854_perm_0, x = var_852)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 1, 256]> input_151 = add(x = x_21, y = var_854)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 1, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 1, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_877 = const()[name = tensor<string, []>("op_877"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_878 = mul(x = input_159, y = var_877)[name = tensor<string, []>("op_878")];
-            tensor<fp32, [1, 1, 256]> input_161 = add(x = var_878, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_878_begin_0 = const()[name = tensor<string, []>("op_878_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_878_end_0 = const()[name = tensor<string, []>("op_878_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_878_end_mask_0 = const()[name = tensor<string, []>("op_878_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [1, 1, 256]> var_878 = slice_by_index(begin = var_878_begin_0, end = var_878_end_0, end_mask = var_878_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_878")];
+            tensor<int32, [3]> var_880_perm_0 = const()[name = tensor<string, []>("op_880_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_880 = transpose(perm = var_880_perm_0, x = var_878)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 1, 256]> input_153 = add(x = x_21, y = var_880)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 1, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 1, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_904 = mul(x = input_161, y = var_903)[name = tensor<string, []>("op_904")];
+            tensor<fp32, [1, 1, 256]> input_163 = add(x = var_904, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 1]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_41, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = cat)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_898 = const()[name = tensor<string, []>("op_898"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 1, 1]> var_899 = reduce_l2_norm(axes = var_898, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = cat)[name = tensor<string, []>("op_922")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 1, 1]> var_925 = reduce_l2_norm(axes = var_924, keep_dims = var_35, x = input_165)[name = tensor<string, []>("op_925")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_899)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_903_axis_0 = const()[name = tensor<string, []>("op_903_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_903_axis_0, values = (var_206, var_396, var_586, nkv_1))[name = tensor<string, []>("op_903")];
-            tensor<int32, []> var_905_axis_0 = const()[name = tensor<string, []>("op_905_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_905_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_905")];
-            tensor<int32, []> var_907_axis_0 = const()[name = tensor<string, []>("op_907_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_907_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_907")];
-            tensor<fp32, []> var_916 = const()[name = tensor<string, []>("op_916"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_923 = const()[name = tensor<string, []>("op_923"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_926 = const()[name = tensor<string, []>("op_926"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<int32, []>(0)];
-            tensor<fp32, [1, 1, 12, 256]> var_993 = const()[name = tensor<string, []>("op_993"), val = tensor<fp32, [1, 1, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_998_axes_0 = const()[name = tensor<string, []>("op_998_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 1, 1, 256]> var_998 = expand_dims(axes = var_998_axes_0, x = emb)[name = tensor<string, []>("op_998")];
+            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_49, beta = const_12, x = var_925)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_929_axis_0, values = (var_232, var_422, var_612, nkv_1))[name = tensor<string, []>("op_929")];
+            tensor<int32, []> var_931_axis_0 = const()[name = tensor<string, []>("op_931_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_931_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_931")];
+            tensor<int32, []> var_933_axis_0 = const()[name = tensor<string, []>("op_933_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_933_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_933")];
+            tensor<fp32, [1, 1, 12, 256]> var_996 = const()[name = tensor<string, []>("op_996"), val = tensor<fp32, [1, 1, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
+            tensor<int32, [1]> var_1001_axes_0 = const()[name = tensor<string, []>("op_1001_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 1, 1, 256]> var_1001 = expand_dims(axes = var_1001_axes_0, x = emb)[name = tensor<string, []>("op_1001")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 1, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_998)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 1, 12, 512]> input_165 = concat(axis = var_930, interleave = input_165_interleave_0, values = (emb_exp, var_993))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 1, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1010 = const()[name = tensor<string, []>("op_1010"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 1, 256]> x_29 = reshape(shape = var_1010, x = var_1006)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 1, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1001)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 1, 12, 512]> input_167 = concat(axis = var_42, interleave = input_167_interleave_0, values = (emb_exp, var_996))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 1, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1009_perm_0 = const()[name = tensor<string, []>("op_1009_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1013 = const()[name = tensor<string, []>("op_1013"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1009 = transpose(perm = var_1009_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 1, 256]> x_29 = reshape(shape = var_1013, x = var_1009)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -846,131 +845,131 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 1, 256]> var_1018 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1020 = reshape(shape = var_1019, x = var_1018)[name = tensor<string, []>("op_1020")];
+            tensor<fp32, [12, 1, 256]> var_1021 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1023 = reshape(shape = var_1022, x = var_1021)[name = tensor<string, []>("op_1023")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1024 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1025 = const()[name = tensor<string, []>("op_1025"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 1, 256]> var_1026 = mul(x = var_1024, y = var_1025)[name = tensor<string, []>("op_1026")];
-            tensor<int32, [4]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1028 = reshape(shape = var_1027, x = var_1026)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [12, 1, 256]> var_1027 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1028 = const()[name = tensor<string, []>("op_1028"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 1, 256]> var_1029 = mul(x = var_1027, y = var_1028)[name = tensor<string, []>("op_1029")];
+            tensor<int32, [4]> var_1030 = const()[name = tensor<string, []>("op_1030"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1031 = reshape(shape = var_1030, x = var_1029)[name = tensor<string, []>("op_1031")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1032 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1033 = const()[name = tensor<string, []>("op_1033"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1034 = reshape(shape = var_1033, x = var_1032)[name = tensor<string, []>("op_1034")];
+            tensor<fp32, [12, 1, 256]> var_1035 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1036 = const()[name = tensor<string, []>("op_1036"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1037 = reshape(shape = var_1036, x = var_1035)[name = tensor<string, []>("op_1037")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 1, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_936, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_39, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [1]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_1 = clip(alpha = var_926, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [1]> clip_1 = clip(alpha = var_29, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [1]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1028)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1020)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1031)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1023)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 1, 1]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1046 = const()[name = tensor<string, []>("op_1046"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1047 = reshape(shape = var_1046, x = valid_mask)[name = tensor<string, []>("op_1047")];
             tensor<int32, [2]> var_1049 = const()[name = tensor<string, []>("op_1049"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = sqrt_s_t_9)[name = tensor<string, []>("op_1050")];
-            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1047, y = var_1050)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 1, 1]> var_1052 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = valid_mask)[name = tensor<string, []>("op_1050")];
+            tensor<int32, [2]> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1053 = reshape(shape = var_1052, x = sqrt_s_t_9)[name = tensor<string, []>("op_1053")];
+            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1050, y = var_1053)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 1, 1]> var_1055 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1055")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1034)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1052, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1054_transpose_x_0 = const()[name = tensor<string, []>("op_1054_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1054_transpose_y_0 = const()[name = tensor<string, []>("op_1054_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> var_1054 = matmul(transpose_x = var_1054_transpose_x_0, transpose_y = var_1054_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1054")];
-            tensor<fp32, [1]> var_1055 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1055")];
-            tensor<int32, [4]> var_1056 = const()[name = tensor<string, []>("op_1056"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1057 = reshape(shape = var_1056, x = var_1055)[name = tensor<string, []>("op_1057")];
-            tensor<fp32, [12, 4, 1, 64]> cross_9 = mul(x = var_1054, y = var_1057)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1037)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1055, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1057_transpose_x_0 = const()[name = tensor<string, []>("op_1057_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1057_transpose_y_0 = const()[name = tensor<string, []>("op_1057_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> var_1057 = matmul(transpose_x = var_1057_transpose_x_0, transpose_y = var_1057_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1057")];
+            tensor<fp32, [1]> var_1058 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1058")];
+            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [12, 4, 1, 64]> cross_9 = mul(x = var_1057, y = var_1060)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 1, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1060 = const()[name = tensor<string, []>("op_1060"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1061 = reshape(shape = var_1060, x = valid_mask)[name = tensor<string, []>("op_1061")];
-            tensor<fp32, [12, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1061)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1063 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1063")];
-            tensor<bool, []> var_1065_transpose_x_1 = const()[name = tensor<string, []>("op_1065_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1065_transpose_y_1 = const()[name = tensor<string, []>("op_1065_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1065 = matmul(transpose_x = var_1065_transpose_x_1, transpose_y = var_1065_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1065")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1063, y = var_1065)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1067_keep_dims_0 = const()[name = tensor<string, []>("op_1067_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1067 = reduce_sum(keep_dims = var_1067_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1067")];
-            tensor<int32, [1]> var_1068 = const()[name = tensor<string, []>("op_1068"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1069 = reshape(shape = var_1068, x = var_1067)[name = tensor<string, []>("op_1069")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1069)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1063 = const()[name = tensor<string, []>("op_1063"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1064 = reshape(shape = var_1063, x = valid_mask)[name = tensor<string, []>("op_1064")];
+            tensor<fp32, [12, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1064)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1066 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1066")];
+            tensor<bool, []> var_1068_transpose_x_1 = const()[name = tensor<string, []>("op_1068_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1068_transpose_y_1 = const()[name = tensor<string, []>("op_1068_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1068 = matmul(transpose_x = var_1068_transpose_x_1, transpose_y = var_1068_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1068")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1066, y = var_1068)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1070_keep_dims_0 = const()[name = tensor<string, []>("op_1070_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1070 = reduce_sum(keep_dims = var_1070_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1070")];
+            tensor<int32, [1]> var_1071 = const()[name = tensor<string, []>("op_1071"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1072 = reshape(shape = var_1071, x = var_1070)[name = tensor<string, []>("op_1072")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1072)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_926, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_29, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1073 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1073")];
-            tensor<int32, [4]> var_1074_perm_0 = const()[name = tensor<string, []>("op_1074_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1076 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1076")];
+            tensor<int32, [4]> var_1077_perm_0 = const()[name = tensor<string, []>("op_1077_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 4, 64]> var_1074 = transpose(perm = var_1074_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_923, x = var_1074)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> out_29 = reshape(shape = var_1078, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 1, 256]> var_1080 = silu(x = input_169)[name = tensor<string, []>("op_1080")];
-            tensor<fp32, [12, 1, 256]> input_171 = mul(x = var_1080, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 1, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 1, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 1, 4, 64]> var_1077 = transpose(perm = var_1077_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_44, x = var_1077)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> out_29 = reshape(shape = var_1081, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 1, 256]> var_1083 = silu(x = input_171)[name = tensor<string, []>("op_1083")];
+            tensor<fp32, [12, 1, 256]> input_173 = mul(x = var_1083, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 1, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 1, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_921, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1090 = const()[name = tensor<string, []>("op_1090"), val = tensor<int32, [4]>([1, 12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1091 = reshape(shape = var_1090, x = xt_1)[name = tensor<string, []>("op_1091")];
-            tensor<int32, [4]> var_1092_perm_0 = const()[name = tensor<string, []>("op_1092_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1095 = const()[name = tensor<string, []>("op_1095"), val = tensor<int32, [3]>([1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> var_1092 = transpose(perm = var_1092_perm_0, x = var_1091)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [1, 12, 256]> query_1 = reshape(shape = var_1095, x = var_1092)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_36, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [4]>([1, 12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1094 = reshape(shape = var_1093, x = xt_1)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [4]> var_1095_perm_0 = const()[name = tensor<string, []>("op_1095_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1098 = const()[name = tensor<string, []>("op_1098"), val = tensor<int32, [3]>([1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> var_1095 = transpose(perm = var_1095_perm_0, x = var_1094)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [1, 12, 256]> query_1 = reshape(shape = var_1098, x = var_1095)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 1, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 1, 768]> var_1118 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 1, 768]> var_1121 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 1, 3, 256])];
-            tensor<fp32, [12, 1, 3, 256]> var_1120 = reshape(shape = concat_1, x = var_1118)[name = tensor<string, []>("op_1120")];
-            tensor<int32, [1]> var_1121_axes_0 = const()[name = tensor<string, []>("op_1121_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 1, 3, 256]> var_1121 = expand_dims(axes = var_1121_axes_0, x = var_1120)[name = tensor<string, []>("op_1121")];
-            tensor<int32, [5]> var_1122_perm_0 = const()[name = tensor<string, []>("op_1122_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1123_axes_0 = const()[name = tensor<string, []>("op_1123_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 1, 1, 256]> var_1122 = transpose(perm = var_1122_perm_0, x = var_1121)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 1, 256]> var_1123 = squeeze(axes = var_1123_axes_0, x = var_1122)[name = tensor<string, []>("op_1123")];
+            tensor<fp32, [12, 1, 3, 256]> var_1123 = reshape(shape = concat_1, x = var_1121)[name = tensor<string, []>("op_1123")];
+            tensor<int32, [1]> var_1124_axes_0 = const()[name = tensor<string, []>("op_1124_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 1, 3, 256]> var_1124 = expand_dims(axes = var_1124_axes_0, x = var_1123)[name = tensor<string, []>("op_1124")];
+            tensor<int32, [5]> var_1125_perm_0 = const()[name = tensor<string, []>("op_1125_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1126_axes_0 = const()[name = tensor<string, []>("op_1126_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 1, 1, 256]> var_1125 = transpose(perm = var_1125_perm_0, x = var_1124)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 1, 256]> var_1126 = squeeze(axes = var_1126_axes_0, x = var_1125)[name = tensor<string, []>("op_1126")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 1, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 1, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 1, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1131 = const()[name = tensor<string, []>("op_1131"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1132 = reshape(shape = var_1131, x = q_11)[name = tensor<string, []>("op_1132")];
+            tensor<fp32, [12, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1134 = const()[name = tensor<string, []>("op_1134"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1135 = reshape(shape = var_1134, x = q_11)[name = tensor<string, []>("op_1135")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1138 = const()[name = tensor<string, []>("op_1138"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1139 = reshape(shape = var_1138, x = k_11)[name = tensor<string, []>("op_1139")];
+            tensor<int32, [3]> var_1141 = const()[name = tensor<string, []>("op_1141"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1142 = reshape(shape = var_1141, x = k_11)[name = tensor<string, []>("op_1142")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1146 = reshape(shape = var_1145, x = v_11)[name = tensor<string, []>("op_1146")];
+            tensor<int32, [3]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1149 = reshape(shape = var_1148, x = v_11)[name = tensor<string, []>("op_1149")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1149 = const()[name = tensor<string, []>("op_1149"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1132)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [1, 4, 12, 64]> q_15 = reshape(shape = var_1149, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1151 = const()[name = tensor<string, []>("op_1151"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1139)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [1, 4, 12, 64]> k_15 = reshape(shape = var_1151, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1153 = const()[name = tensor<string, []>("op_1153"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1146)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [1, 4, 12, 64]> v_15 = reshape(shape = var_1153, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1135)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [1, 4, 12, 64]> q_15 = reshape(shape = var_1152, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1142)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [1, 4, 12, 64]> k_15 = reshape(shape = var_1154, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1149)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [1, 4, 12, 64]> v_15 = reshape(shape = var_1156, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -981,30 +980,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1161 = const()[name = tensor<string, []>("op_1161"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [12, 1, 4, 64]> var_1157 = transpose(perm = var_1156, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1161, x = var_1157)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> attn_output_7 = reshape(shape = var_1165, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [12, 1, 4, 64]> var_1160 = transpose(perm = var_1159, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1164, x = var_1160)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1168 = const()[name = tensor<string, []>("op_1168"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> attn_output_7 = reshape(shape = var_1168, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [1, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_921, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [1, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [1, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [1, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [1, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_36, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [1, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [1, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [1, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_921, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> x_31 = reshape(shape = var_1185, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1187_perm_0 = const()[name = tensor<string, []>("op_1187_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1187 = transpose(perm = var_1187_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 1, 256]> x = reshape(shape = var_1191, x = var_1187)[name = tensor<string, []>("x")];
+            tensor<fp32, [1, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_36, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([1, 1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> x_31 = reshape(shape = var_1188, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1190_perm_0 = const()[name = tensor<string, []>("op_1190_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1190 = transpose(perm = var_1190_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 1, 256]> x = reshape(shape = var_1194, x = var_1190)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1015,120 +1014,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 1, 256]> var_1199 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1200 = const()[name = tensor<string, []>("op_1200"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1201 = reshape(shape = var_1200, x = var_1199)[name = tensor<string, []>("op_1201")];
+            tensor<fp32, [12, 1, 256]> var_1202 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1203 = const()[name = tensor<string, []>("op_1203"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1204 = reshape(shape = var_1203, x = var_1202)[name = tensor<string, []>("op_1204")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1205 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 1, 256]> var_1207 = mul(x = var_1205, y = var_1206)[name = tensor<string, []>("op_1207")];
-            tensor<int32, [4]> var_1208 = const()[name = tensor<string, []>("op_1208"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1209 = reshape(shape = var_1208, x = var_1207)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [12, 1, 256]> var_1208 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 1, 256]> var_1210 = mul(x = var_1208, y = var_1209)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [4]> var_1211 = const()[name = tensor<string, []>("op_1211"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1212 = reshape(shape = var_1211, x = var_1210)[name = tensor<string, []>("op_1212")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1213 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1214 = const()[name = tensor<string, []>("op_1214"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1215 = reshape(shape = var_1214, x = var_1213)[name = tensor<string, []>("op_1215")];
+            tensor<fp32, [12, 1, 256]> var_1216 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1218 = reshape(shape = var_1217, x = var_1216)[name = tensor<string, []>("op_1218")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 1, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [1]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_3 = clip(alpha = var_926, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [1]> clip_3 = clip(alpha = var_29, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [1]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1209)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1201)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1212)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1204)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 1, 1]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1231 = reshape(shape = var_1230, x = sqrt_s_t)[name = tensor<string, []>("op_1231")];
-            tensor<fp32, [1, 1]> M = real_div(x = var_1047, y = var_1231)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 1, 1]> var_1233 = mul(x = qk, y = M)[name = tensor<string, []>("op_1233")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1215)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 1, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1233, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1235_transpose_x_0 = const()[name = tensor<string, []>("op_1235_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1235_transpose_y_0 = const()[name = tensor<string, []>("op_1235_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> var_1235 = matmul(transpose_x = var_1235_transpose_x_0, transpose_y = var_1235_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1235")];
-            tensor<fp32, [1]> var_1236 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1236")];
-            tensor<int32, [4]> var_1237 = const()[name = tensor<string, []>("op_1237"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1238 = reshape(shape = var_1237, x = var_1236)[name = tensor<string, []>("op_1238")];
-            tensor<fp32, [12, 4, 1, 64]> cross = mul(x = var_1235, y = var_1238)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 1, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1061)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1244 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1244")];
-            tensor<bool, []> var_1246_transpose_x_1 = const()[name = tensor<string, []>("op_1246_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1246_transpose_y_1 = const()[name = tensor<string, []>("op_1246_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1246 = matmul(transpose_x = var_1246_transpose_x_1, transpose_y = var_1246_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1246")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1244, y = var_1246)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1069)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1233 = const()[name = tensor<string, []>("op_1233"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1234 = reshape(shape = var_1233, x = sqrt_s_t)[name = tensor<string, []>("op_1234")];
+            tensor<fp32, [1, 1]> M = real_div(x = var_1050, y = var_1234)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 1, 1]> var_1236 = mul(x = qk, y = M)[name = tensor<string, []>("op_1236")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1218)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 1, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1236, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1238_transpose_x_0 = const()[name = tensor<string, []>("op_1238_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1238_transpose_y_0 = const()[name = tensor<string, []>("op_1238_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> var_1238 = matmul(transpose_x = var_1238_transpose_x_0, transpose_y = var_1238_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1238")];
+            tensor<fp32, [1]> var_1239 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [12, 4, 1, 64]> cross = mul(x = var_1238, y = var_1241)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 1, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1064)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1247 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1247")];
+            tensor<bool, []> var_1249_transpose_x_1 = const()[name = tensor<string, []>("op_1249_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1249_transpose_y_1 = const()[name = tensor<string, []>("op_1249_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1249 = matmul(transpose_x = var_1249_transpose_x_1, transpose_y = var_1249_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1249")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1247, y = var_1249)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1072)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_926, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_29, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1255_perm_0 = const()[name = tensor<string, []>("op_1255_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1258_perm_0 = const()[name = tensor<string, []>("op_1258_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 4, 64]> var_1255 = transpose(perm = var_1255_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_923, x = var_1255)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> out = reshape(shape = var_1259, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 1, 256]> var_1261 = silu(x = input_187)[name = tensor<string, []>("op_1261")];
-            tensor<fp32, [12, 1, 256]> input_189 = mul(x = var_1261, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 1, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 1, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 1, 4, 64]> var_1258 = transpose(perm = var_1258_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_44, x = var_1258)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> out = reshape(shape = var_1262, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 1, 256]> var_1264 = silu(x = input_189)[name = tensor<string, []>("op_1264")];
+            tensor<fp32, [12, 1, 256]> input_191 = mul(x = var_1264, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 1, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 1, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_921, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1271 = const()[name = tensor<string, []>("op_1271"), val = tensor<int32, [4]>([1, 12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1272 = reshape(shape = var_1271, x = xt_5)[name = tensor<string, []>("op_1272")];
-            tensor<int32, [4]> var_1273_perm_0 = const()[name = tensor<string, []>("op_1273_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1276 = const()[name = tensor<string, []>("op_1276"), val = tensor<int32, [3]>([1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> var_1273 = transpose(perm = var_1273_perm_0, x = var_1272)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [1, 12, 256]> query_5 = reshape(shape = var_1276, x = var_1273)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_36, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [4]>([1, 12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1275 = reshape(shape = var_1274, x = xt_5)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [4]> var_1276_perm_0 = const()[name = tensor<string, []>("op_1276_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1279 = const()[name = tensor<string, []>("op_1279"), val = tensor<int32, [3]>([1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> var_1276 = transpose(perm = var_1276_perm_0, x = var_1275)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [1, 12, 256]> query_5 = reshape(shape = var_1279, x = var_1276)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 1, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 1, 768]> var_1299 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 1, 768]> var_1302 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 1, 3, 256])];
-            tensor<fp32, [12, 1, 3, 256]> var_1301 = reshape(shape = concat_2, x = var_1299)[name = tensor<string, []>("op_1301")];
-            tensor<int32, [1]> var_1302_axes_0 = const()[name = tensor<string, []>("op_1302_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 1, 3, 256]> var_1302 = expand_dims(axes = var_1302_axes_0, x = var_1301)[name = tensor<string, []>("op_1302")];
-            tensor<int32, [5]> var_1303_perm_0 = const()[name = tensor<string, []>("op_1303_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1304_axes_0 = const()[name = tensor<string, []>("op_1304_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 1, 1, 256]> var_1303 = transpose(perm = var_1303_perm_0, x = var_1302)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 1, 256]> var_1304 = squeeze(axes = var_1304_axes_0, x = var_1303)[name = tensor<string, []>("op_1304")];
+            tensor<fp32, [12, 1, 3, 256]> var_1304 = reshape(shape = concat_2, x = var_1302)[name = tensor<string, []>("op_1304")];
+            tensor<int32, [1]> var_1305_axes_0 = const()[name = tensor<string, []>("op_1305_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 1, 3, 256]> var_1305 = expand_dims(axes = var_1305_axes_0, x = var_1304)[name = tensor<string, []>("op_1305")];
+            tensor<int32, [5]> var_1306_perm_0 = const()[name = tensor<string, []>("op_1306_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1307_axes_0 = const()[name = tensor<string, []>("op_1307_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 1, 1, 256]> var_1306 = transpose(perm = var_1306_perm_0, x = var_1305)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 1, 256]> var_1307 = squeeze(axes = var_1307_axes_0, x = var_1306)[name = tensor<string, []>("op_1307")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 1, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 1, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 1, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1312 = const()[name = tensor<string, []>("op_1312"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1313 = reshape(shape = var_1312, x = q_19)[name = tensor<string, []>("op_1313")];
+            tensor<fp32, [12, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1315 = const()[name = tensor<string, []>("op_1315"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1316 = reshape(shape = var_1315, x = q_19)[name = tensor<string, []>("op_1316")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1319 = const()[name = tensor<string, []>("op_1319"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1320 = reshape(shape = var_1319, x = k_19)[name = tensor<string, []>("op_1320")];
+            tensor<int32, [3]> var_1322 = const()[name = tensor<string, []>("op_1322"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1323 = reshape(shape = var_1322, x = k_19)[name = tensor<string, []>("op_1323")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1327 = reshape(shape = var_1326, x = v_19)[name = tensor<string, []>("op_1327")];
+            tensor<int32, [3]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1330 = reshape(shape = var_1329, x = v_19)[name = tensor<string, []>("op_1330")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1330 = const()[name = tensor<string, []>("op_1330"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1313)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [1, 4, 12, 64]> q = reshape(shape = var_1330, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1332 = const()[name = tensor<string, []>("op_1332"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1320)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [1, 4, 12, 64]> k = reshape(shape = var_1332, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1334 = const()[name = tensor<string, []>("op_1334"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1327)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [1, 4, 12, 64]> v = reshape(shape = var_1334, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1316)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [1, 4, 12, 64]> q = reshape(shape = var_1333, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1323)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [1, 4, 12, 64]> k = reshape(shape = var_1335, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1330)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [1, 4, 12, 64]> v = reshape(shape = var_1337, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1139,34 +1138,34 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1342 = const()[name = tensor<string, []>("op_1342"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [12, 1, 4, 64]> var_1338 = transpose(perm = var_1337, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1342, x = var_1338)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> attn_output = reshape(shape = var_1346, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1345 = const()[name = tensor<string, []>("op_1345"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [12, 1, 4, 64]> var_1341 = transpose(perm = var_1340, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1345, x = var_1341)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1349 = const()[name = tensor<string, []>("op_1349"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> attn_output = reshape(shape = var_1349, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [1, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_921, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [1, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [1, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [1, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [1, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_36, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [1, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [1, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [1, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_921, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> input = reshape(shape = var_1366, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1368 = const()[name = tensor<string, []>("op_1368"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 12, 1]> var_1369 = reduce_l2_norm(axes = var_1368, keep_dims = var_924, x = input)[name = tensor<string, []>("op_1369")];
+            tensor<fp32, [1, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_36, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([1, 1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> input = reshape(shape = var_1369, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 12, 1]> var_1372 = reduce_l2_norm(axes = var_1371, keep_dims = var_35, x = input)[name = tensor<string, []>("op_1372")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 12, 1]> clip_5 = clip(alpha = var_916, beta = const_42, x = var_1369)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 1, 12, 256]> var_1371 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1371")];
+            tensor<fp32, [1, 1, 12, 1]> clip_5 = clip(alpha = var_49, beta = const_42, x = var_1372)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 1, 12, 256]> var_1374 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1374")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([1, 256, 12])];
-            tensor<fp32, [1, 1, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1371)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 1, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1374)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [1, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1175,10 +1174,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 1, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 1, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = matmul_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 1, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1375")];
-            tensor<int32, []> var_1377_axis_0 = const()[name = tensor<string, []>("op_1377_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1377_axis_0, values = (var_1073, nkv))[name = tensor<string, []>("op_1377")];
-            tensor<int32, []> var_1379_axis_0 = const()[name = tensor<string, []>("op_1379_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1379_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1379")];
+            tensor<fp32, [1, 1, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1378")];
+            tensor<int32, []> var_1380_axis_0 = const()[name = tensor<string, []>("op_1380_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1380_axis_0, values = (var_1076, nkv))[name = tensor<string, []>("op_1380")];
+            tensor<int32, []> var_1382_axis_0 = const()[name = tensor<string, []>("op_1382_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1382_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1382")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 2b188342f30379c189c2ef1ec6d955b9033ba14d..b188b3ecab52bd0e3ef6ff0a638d3cdfe256dc40 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0e1bf629771d4cad61a66e7932eb77dae2f0231edb3f0070409798b2d0c643c
-size 171364
+oid sha256:9e9f5785b2d4260e790dcbe4aa4afb5c33fb5377866338c24ddf9e39bf75a1c0
+size 175282
diff --git a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Manifest.json b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Manifest.json
index 84f6f3de60aa59bdc9ffe7ddbf987d76c5f22beb..16f3cfcaecb66465ec8f40bc6e1e4c580806a317 100644
--- a/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Manifest.json
+++ b/optimized/dih2/100ms/ls_eend_dih2_100ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "377FDFC9-ABF8-4C96-8917-8CBDA39D46CD": {
+        "34D13A76-D1CC-4F3E-BACD-855110612CCB": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         },
-        "EEC7467E-63A3-4C31-9B54-74606F035FAA": {
+        "59FE4953-2547-41D5-BACE-CC39E25C6CE9": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "377FDFC9-ABF8-4C96-8917-8CBDA39D46CD"
+    "rootModelIdentifier": "34D13A76-D1CC-4F3E-BACD-855110612CCB"
 }
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/analytics/coremldata.bin b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/analytics/coremldata.bin
index 397635a9a0e1cd59d8cf6642abba267486b68f69..da9d72c722d78161a3ac653368f6509b3416670b 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f1b0bab18a5f1a234fd3b8f86af03830354b0ff7d715753b069b582fd0dac6de
+oid sha256:4a16507ded5530171695a9e2509707ad428a6321635d34dd07ec69fd35b1d8d5
 size 243
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/coremldata.bin b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/coremldata.bin
index e804d1c5d44837686767f82aef6a0244dc12e26c..781371005b7381affae84182486c13a36628874b 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/coremldata.bin
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76832edb1250c28bb6886d9fd633957adb70b61d6525ec6e5ee1131c50305350
-size 1308
+oid sha256:a160a2ed2d67341409c7837e2856513edf10369b35df3aaea2ee36ecb094050a
+size 1411
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/metadata.json b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/metadata.json
index fcec287e71dc3d3cfdc7a74070099ad04b2b2ae4..e6cf77d52791285f08f6a72989700edc635598d0 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/metadata.json
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=2, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=2, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 48,
+      "Ios17.sliceByIndex" : 50,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 14,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 2 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 25 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 2, 345]",
+        "shape" : "[1, 25, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 25}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/model.mil b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/model.mil
index 5651eaf71d9adaeffd8f5bfe34de4f9950314977..2ccb23e6a99a5f049025ab2b67fb75bd7b7827a2 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/model.mil
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlmodelc/model.mil
@@ -1,234 +1,248 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 2, 345]> features, tensor<fp32, [2]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [2, 2]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [2]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [2, 2]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 2, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 25, 23]> features, tensor<fp32, [2]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [2, 2]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [2]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [2, 2]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, [3]>([1, 2, 345])];
+            tensor<fp32, [1, 2, 345]> input_1 = reshape(shape = var_36, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_45 = const()[name = tensor<string, []>("op_45"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_51 = const()[name = tensor<string, []>("op_51"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 2, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 2, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 2, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_46, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 2, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 2, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_183 = const()[name = tensor<string, []>("op_183"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_184 = mul(x = input_13, y = var_183)[name = tensor<string, []>("op_184")];
+            tensor<fp32, [1, 2, 256]> input_15 = add(x = var_184, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,153 +253,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 2, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 2, 256]> var_198 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_199 = const()[name = tensor<string, []>("op_199"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_200 = reshape(shape = var_199, x = var_198)[name = tensor<string, []>("op_200")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 2, 256]> var_204 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_205 = const()[name = tensor<string, []>("op_205"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_206 = mul(x = var_204, y = var_205)[name = tensor<string, []>("op_206")];
+            tensor<int32, [4]> var_207 = const()[name = tensor<string, []>("op_207"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_208 = reshape(shape = var_207, x = var_206)[name = tensor<string, []>("op_208")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 2, 256]> var_212 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_213 = const()[name = tensor<string, []>("op_213"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_214 = reshape(shape = var_213, x = var_212)[name = tensor<string, []>("op_214")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 2, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [2]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_208)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_200)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 2, 2]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [2, 2]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 2, 2]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_224 = const()[name = tensor<string, []>("op_224"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_225 = reshape(shape = var_224, x = sqrt_s_t_1)[name = tensor<string, []>("op_225")];
+            tensor<fp32, [2, 2]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_225)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 2, 2]> var_227 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_227")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [2]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_214)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_227, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_229_transpose_x_0 = const()[name = tensor<string, []>("op_229_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_229_transpose_y_0 = const()[name = tensor<string, []>("op_229_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_229 = matmul(transpose_x = var_229_transpose_x_0, transpose_y = var_229_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_229")];
+            tensor<fp32, [2]> var_230 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_230")];
+            tensor<int32, [4]> var_231 = const()[name = tensor<string, []>("op_231"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_232 = reshape(shape = var_231, x = var_230)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_229, y = var_232)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 2, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_235 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_235")];
+            tensor<bool, []> var_237_transpose_x_1 = const()[name = tensor<string, []>("op_237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_237_transpose_y_1 = const()[name = tensor<string, []>("op_237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_237 = matmul(transpose_x = var_237_transpose_x_1, transpose_y = var_237_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_237")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_235, y = var_237)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_239 = const()[name = tensor<string, []>("op_239"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_239)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_241 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 4, 64, 64]> var_242 = real_div(x = new_kv_unnorm_1, y = var_241)[name = tensor<string, []>("op_242")];
+            tensor<int32, [4]> var_243_perm_0 = const()[name = tensor<string, []>("op_243_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 2, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 2, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 2, 4, 64]> var_243 = transpose(perm = var_243_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_54, x = var_243)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_247 = const()[name = tensor<string, []>("op_247"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_247, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 2, 256]> var_249 = silu(x = input_19)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 2, 256]> input_21 = mul(x = var_249, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = x_3)[name = tensor<string, []>("op_257")];
+            tensor<int32, [3]> var_260_begin_0 = const()[name = tensor<string, []>("op_260_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_260_end_0 = const()[name = tensor<string, []>("op_260_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_260_end_mask_0 = const()[name = tensor<string, []>("op_260_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_260 = slice_by_index(begin = var_260_begin_0, end = var_260_end_0, end_mask = var_260_end_mask_0, x = window_1)[name = tensor<string, []>("op_260")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_62, interleave = window_3_interleave_0, values = (var_260, var_257))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_265_begin_0 = const()[name = tensor<string, []>("op_265_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_265_end_0 = const()[name = tensor<string, []>("op_265_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_265_end_mask_0 = const()[name = tensor<string, []>("op_265_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_265 = slice_by_index(begin = var_265_begin_0, end = var_265_end_0, end_mask = var_265_end_mask_0, x = x_3)[name = tensor<string, []>("op_265")];
+            tensor<int32, [3]> var_268_begin_0 = const()[name = tensor<string, []>("op_268_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_268_end_0 = const()[name = tensor<string, []>("op_268_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_268_end_mask_0 = const()[name = tensor<string, []>("op_268_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_268 = slice_by_index(begin = var_268_begin_0, end = var_268_end_0, end_mask = var_268_end_mask_0, x = window_3)[name = tensor<string, []>("op_268")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_62, interleave = window_5_interleave_0, values = (var_268, var_265))[name = tensor<string, []>("window_5")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_23 = concat(axis = var_49, interleave = input_23_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_257_split_sizes_0 = const()[name = tensor<string, []>("op_257_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_257_axis_0 = const()[name = tensor<string, []>("op_257_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_257_0, tensor<fp32, [2, 256, 16]> var_257_1 = split(axis = var_257_axis_0, split_sizes = var_257_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_257")];
-            tensor<fp32, [2, 256, 16]> var_259 = sigmoid(x = var_257_1)[name = tensor<string, []>("op_259")];
-            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_257_0, y = var_259)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [2, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_293_split_sizes_0 = const()[name = tensor<string, []>("op_293_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_293_axis_0 = const()[name = tensor<string, []>("op_293_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_293_0, tensor<fp32, [2, 256, 16]> var_293_1 = split(axis = var_293_axis_0, split_sizes = var_293_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_293")];
+            tensor<fp32, [2, 256, 16]> var_295 = sigmoid(x = var_293_1)[name = tensor<string, []>("op_295")];
+            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_293_0, y = var_295)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [2, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [2, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [2, 1, 256]> var_290 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_290")];
-            tensor<int32, [3]> var_292_perm_0 = const()[name = tensor<string, []>("op_292_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_292 = transpose(perm = var_292_perm_0, x = var_290)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 2, 256]> input_31 = add(x = x_3, y = var_292)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 2, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 2, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_315 = const()[name = tensor<string, []>("op_315"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_316 = mul(x = input_39, y = var_315)[name = tensor<string, []>("op_316")];
-            tensor<fp32, [1, 2, 256]> input_41 = add(x = var_316, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_326_begin_0 = const()[name = tensor<string, []>("op_326_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_326_end_0 = const()[name = tensor<string, []>("op_326_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_326_end_mask_0 = const()[name = tensor<string, []>("op_326_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [2, 1, 256]> var_326 = slice_by_index(begin = var_326_begin_0, end = var_326_end_0, end_mask = var_326_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_326")];
+            tensor<int32, [3]> var_328_perm_0 = const()[name = tensor<string, []>("op_328_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_328 = transpose(perm = var_328_perm_0, x = var_326)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 2, 256]> input_33 = add(x = x_3, y = var_328)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 2, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 2, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_351 = const()[name = tensor<string, []>("op_351"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_352 = mul(x = input_41, y = var_351)[name = tensor<string, []>("op_352")];
+            tensor<fp32, [1, 2, 256]> input_43 = add(x = var_352, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 2, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 2, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_345 = const()[name = tensor<string, []>("op_345"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_346 = mul(x = input_51, y = var_345)[name = tensor<string, []>("op_346")];
-            tensor<fp32, [1, 2, 256]> input_53 = add(x = var_346, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 2, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 2, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_381 = const()[name = tensor<string, []>("op_381"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_382 = mul(x = input_53, y = var_381)[name = tensor<string, []>("op_382")];
+            tensor<fp32, [1, 2, 256]> input_55 = add(x = var_382, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -396,153 +410,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 2, 256]> var_360 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 2, 256]> var_396 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_397 = const()[name = tensor<string, []>("op_397"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_398 = reshape(shape = var_397, x = var_396)[name = tensor<string, []>("op_398")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_366 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_368 = mul(x = var_366, y = var_367)[name = tensor<string, []>("op_368")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 2, 256]> var_402 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_403 = const()[name = tensor<string, []>("op_403"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_404 = mul(x = var_402, y = var_403)[name = tensor<string, []>("op_404")];
+            tensor<int32, [4]> var_405 = const()[name = tensor<string, []>("op_405"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_406 = reshape(shape = var_405, x = var_404)[name = tensor<string, []>("op_406")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_374 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_376 = reshape(shape = var_375, x = var_374)[name = tensor<string, []>("op_376")];
+            tensor<fp32, [1, 2, 256]> var_410 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 2, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [2]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_406)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_398)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 2, 2]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_387 = reshape(shape = var_386, x = sqrt_s_t_3)[name = tensor<string, []>("op_387")];
-            tensor<fp32, [2, 2]> M_3 = real_div(x = encoder__causal_mask, y = var_387)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 2, 2]> var_389 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_389")];
+            tensor<int32, [2]> var_422 = const()[name = tensor<string, []>("op_422"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_423 = reshape(shape = var_422, x = sqrt_s_t_3)[name = tensor<string, []>("op_423")];
+            tensor<fp32, [2, 2]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_423)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 2, 2]> var_425 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_425")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_376)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_389, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_391_transpose_x_0 = const()[name = tensor<string, []>("op_391_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_391_transpose_y_0 = const()[name = tensor<string, []>("op_391_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_0, transpose_y = var_391_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [2]> var_392 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_392")];
-            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
-            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_391, y = var_394)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_412)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_425, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_427_transpose_x_0 = const()[name = tensor<string, []>("op_427_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_427_transpose_y_0 = const()[name = tensor<string, []>("op_427_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_427 = matmul(transpose_x = var_427_transpose_x_0, transpose_y = var_427_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_427")];
+            tensor<fp32, [2]> var_428 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_428")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_427, y = var_430)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 2, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_397 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_397")];
-            tensor<bool, []> var_399_transpose_x_1 = const()[name = tensor<string, []>("op_399_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_399_transpose_y_1 = const()[name = tensor<string, []>("op_399_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_1, transpose_y = var_399_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_397, y = var_399)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_401)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_403 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 4, 64, 64]> var_404 = real_div(x = new_kv_unnorm_3, y = var_403)[name = tensor<string, []>("op_404")];
-            tensor<int32, [4]> var_405_perm_0 = const()[name = tensor<string, []>("op_405_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_433 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_433")];
+            tensor<bool, []> var_435_transpose_x_1 = const()[name = tensor<string, []>("op_435_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_435_transpose_y_1 = const()[name = tensor<string, []>("op_435_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_435 = matmul(transpose_x = var_435_transpose_x_1, transpose_y = var_435_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_435")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_433, y = var_435)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_437 = const()[name = tensor<string, []>("op_437"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_437)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_439 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_439")];
+            tensor<fp32, [1, 4, 64, 64]> var_440 = real_div(x = new_kv_unnorm_3, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441_perm_0 = const()[name = tensor<string, []>("op_441_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_405 = transpose(perm = var_405_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_405)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_409, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 2, 256]> var_411 = silu(x = input_57)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 2, 256]> input_59 = mul(x = var_411, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 2, 4, 64]> var_441 = transpose(perm = var_441_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_54, x = var_441)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_445 = const()[name = tensor<string, []>("op_445"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_445, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 2, 256]> var_447 = silu(x = input_59)[name = tensor<string, []>("op_447")];
+            tensor<fp32, [1, 2, 256]> input_61 = mul(x = var_447, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_7_begin_0 = const()[name = tensor<string, []>("window_7_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_7_end_0 = const()[name = tensor<string, []>("window_7_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_7_end_mask_0 = const()[name = tensor<string, []>("window_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_7_squeeze_mask_0 = const()[name = tensor<string, []>("window_7_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_7 = slice_by_index(begin = window_7_begin_0, end = window_7_end_0, end_mask = window_7_end_mask_0, squeeze_mask = window_7_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_419 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = x_9)[name = tensor<string, []>("op_419")];
-            tensor<int32, [3]> var_422_begin_0 = const()[name = tensor<string, []>("op_422_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_422_end_0 = const()[name = tensor<string, []>("op_422_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_422_end_mask_0 = const()[name = tensor<string, []>("op_422_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_422 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = window_7)[name = tensor<string, []>("op_422")];
+            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = x_9)[name = tensor<string, []>("op_455")];
+            tensor<int32, [3]> var_458_begin_0 = const()[name = tensor<string, []>("op_458_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_458_end_0 = const()[name = tensor<string, []>("op_458_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_458_end_mask_0 = const()[name = tensor<string, []>("op_458_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_458 = slice_by_index(begin = var_458_begin_0, end = var_458_end_0, end_mask = var_458_end_mask_0, x = window_7)[name = tensor<string, []>("op_458")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_422, var_419))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_62, interleave = window_9_interleave_0, values = (var_458, var_455))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = x_9)[name = tensor<string, []>("op_463")];
+            tensor<int32, [3]> var_466_begin_0 = const()[name = tensor<string, []>("op_466_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_466_end_0 = const()[name = tensor<string, []>("op_466_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_466_end_mask_0 = const()[name = tensor<string, []>("op_466_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_466 = slice_by_index(begin = var_466_begin_0, end = var_466_end_0, end_mask = var_466_end_mask_0, x = window_9)[name = tensor<string, []>("op_466")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_62, interleave = window_11_interleave_0, values = (var_466, var_463))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_63 = concat(axis = var_49, interleave = input_63_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_455_split_sizes_0 = const()[name = tensor<string, []>("op_455_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_455_axis_0 = const()[name = tensor<string, []>("op_455_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_455_0, tensor<fp32, [2, 256, 16]> var_455_1 = split(axis = var_455_axis_0, split_sizes = var_455_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_455")];
-            tensor<fp32, [2, 256, 16]> var_457 = sigmoid(x = var_455_1)[name = tensor<string, []>("op_457")];
-            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_455_0, y = var_457)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [2, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_491_split_sizes_0 = const()[name = tensor<string, []>("op_491_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_491_axis_0 = const()[name = tensor<string, []>("op_491_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_491_0, tensor<fp32, [2, 256, 16]> var_491_1 = split(axis = var_491_axis_0, split_sizes = var_491_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_491")];
+            tensor<fp32, [2, 256, 16]> var_493 = sigmoid(x = var_491_1)[name = tensor<string, []>("op_493")];
+            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_491_0, y = var_493)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [2, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [2, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_488_begin_0 = const()[name = tensor<string, []>("op_488_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_488_end_0 = const()[name = tensor<string, []>("op_488_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_488_end_mask_0 = const()[name = tensor<string, []>("op_488_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [2, 1, 256]> var_488 = slice_by_index(begin = var_488_begin_0, end = var_488_end_0, end_mask = var_488_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_488")];
-            tensor<int32, [3]> var_490_perm_0 = const()[name = tensor<string, []>("op_490_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_490 = transpose(perm = var_490_perm_0, x = var_488)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 2, 256]> input_71 = add(x = x_9, y = var_490)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 2, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 2, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_513 = const()[name = tensor<string, []>("op_513"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_514 = mul(x = input_79, y = var_513)[name = tensor<string, []>("op_514")];
-            tensor<fp32, [1, 2, 256]> input_81 = add(x = var_514, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_524_begin_0 = const()[name = tensor<string, []>("op_524_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_524_end_0 = const()[name = tensor<string, []>("op_524_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_524_end_mask_0 = const()[name = tensor<string, []>("op_524_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [2, 1, 256]> var_524 = slice_by_index(begin = var_524_begin_0, end = var_524_end_0, end_mask = var_524_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_524")];
+            tensor<int32, [3]> var_526_perm_0 = const()[name = tensor<string, []>("op_526_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_526 = transpose(perm = var_526_perm_0, x = var_524)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 2, 256]> input_73 = add(x = x_9, y = var_526)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 2, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 2, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_550 = mul(x = input_81, y = var_549)[name = tensor<string, []>("op_550")];
+            tensor<fp32, [1, 2, 256]> input_83 = add(x = var_550, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 2, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 2, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_544 = mul(x = input_91, y = var_543)[name = tensor<string, []>("op_544")];
-            tensor<fp32, [1, 2, 256]> input_93 = add(x = var_544, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 2, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 2, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_579 = const()[name = tensor<string, []>("op_579"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_580 = mul(x = input_93, y = var_579)[name = tensor<string, []>("op_580")];
+            tensor<fp32, [1, 2, 256]> input_95 = add(x = var_580, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -553,153 +567,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 2, 256]> var_558 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_560 = reshape(shape = var_559, x = var_558)[name = tensor<string, []>("op_560")];
+            tensor<fp32, [1, 2, 256]> var_594 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_595 = const()[name = tensor<string, []>("op_595"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_596 = reshape(shape = var_595, x = var_594)[name = tensor<string, []>("op_596")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_564 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_565 = const()[name = tensor<string, []>("op_565"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_566 = mul(x = var_564, y = var_565)[name = tensor<string, []>("op_566")];
-            tensor<int32, [4]> var_567 = const()[name = tensor<string, []>("op_567"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_568 = reshape(shape = var_567, x = var_566)[name = tensor<string, []>("op_568")];
+            tensor<fp32, [1, 2, 256]> var_600 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_602 = mul(x = var_600, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<int32, [4]> var_603 = const()[name = tensor<string, []>("op_603"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_604 = reshape(shape = var_603, x = var_602)[name = tensor<string, []>("op_604")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_572 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_573 = const()[name = tensor<string, []>("op_573"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_574 = reshape(shape = var_573, x = var_572)[name = tensor<string, []>("op_574")];
+            tensor<fp32, [1, 2, 256]> var_608 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_610 = reshape(shape = var_609, x = var_608)[name = tensor<string, []>("op_610")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 2, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [2]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_568)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_560)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_604)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_596)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 2, 2]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_584 = const()[name = tensor<string, []>("op_584"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_585 = reshape(shape = var_584, x = sqrt_s_t_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [2, 2]> M_5 = real_div(x = encoder__causal_mask, y = var_585)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 2, 2]> var_587 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_587")];
+            tensor<int32, [2]> var_620 = const()[name = tensor<string, []>("op_620"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_621 = reshape(shape = var_620, x = sqrt_s_t_5)[name = tensor<string, []>("op_621")];
+            tensor<fp32, [2, 2]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_621)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 2, 2]> var_623 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_623")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_574)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_587, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_589_transpose_x_0 = const()[name = tensor<string, []>("op_589_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_589_transpose_y_0 = const()[name = tensor<string, []>("op_589_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_589 = matmul(transpose_x = var_589_transpose_x_0, transpose_y = var_589_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_589")];
-            tensor<fp32, [2]> var_590 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_590")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
-            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_589, y = var_592)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_610)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_623, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_625_transpose_x_0 = const()[name = tensor<string, []>("op_625_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_625_transpose_y_0 = const()[name = tensor<string, []>("op_625_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_625 = matmul(transpose_x = var_625_transpose_x_0, transpose_y = var_625_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_625")];
+            tensor<fp32, [2]> var_626 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_626")];
+            tensor<int32, [4]> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_628 = reshape(shape = var_627, x = var_626)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_625, y = var_628)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 2, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_595 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_595")];
-            tensor<bool, []> var_597_transpose_x_1 = const()[name = tensor<string, []>("op_597_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_597_transpose_y_1 = const()[name = tensor<string, []>("op_597_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_597 = matmul(transpose_x = var_597_transpose_x_1, transpose_y = var_597_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_597")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_595, y = var_597)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_599)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_601 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [1, 4, 64, 64]> var_602 = real_div(x = new_kv_unnorm_5, y = var_601)[name = tensor<string, []>("op_602")];
-            tensor<int32, [4]> var_603_perm_0 = const()[name = tensor<string, []>("op_603_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_631 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_631")];
+            tensor<bool, []> var_633_transpose_x_1 = const()[name = tensor<string, []>("op_633_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_633_transpose_y_1 = const()[name = tensor<string, []>("op_633_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_633 = matmul(transpose_x = var_633_transpose_x_1, transpose_y = var_633_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_633")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_631, y = var_633)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_635)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_637 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_637")];
+            tensor<fp32, [1, 4, 64, 64]> var_638 = real_div(x = new_kv_unnorm_5, y = var_637)[name = tensor<string, []>("op_638")];
+            tensor<int32, [4]> var_639_perm_0 = const()[name = tensor<string, []>("op_639_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_603 = transpose(perm = var_603_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_603)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_607, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 2, 256]> var_609 = silu(x = input_97)[name = tensor<string, []>("op_609")];
-            tensor<fp32, [1, 2, 256]> input_99 = mul(x = var_609, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 2, 4, 64]> var_639 = transpose(perm = var_639_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_54, x = var_639)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_643 = const()[name = tensor<string, []>("op_643"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_643, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 2, 256]> var_645 = silu(x = input_99)[name = tensor<string, []>("op_645")];
+            tensor<fp32, [1, 2, 256]> input_101 = mul(x = var_645, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_617_begin_0 = const()[name = tensor<string, []>("op_617_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_617_end_0 = const()[name = tensor<string, []>("op_617_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_617_end_mask_0 = const()[name = tensor<string, []>("op_617_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_617 = slice_by_index(begin = var_617_begin_0, end = var_617_end_0, end_mask = var_617_end_mask_0, x = x_15)[name = tensor<string, []>("op_617")];
-            tensor<int32, [3]> var_620_begin_0 = const()[name = tensor<string, []>("op_620_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_620_end_0 = const()[name = tensor<string, []>("op_620_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_620_end_mask_0 = const()[name = tensor<string, []>("op_620_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_620 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = window_13)[name = tensor<string, []>("op_620")];
+            tensor<int32, [3]> var_653_begin_0 = const()[name = tensor<string, []>("op_653_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_653_end_0 = const()[name = tensor<string, []>("op_653_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_653_end_mask_0 = const()[name = tensor<string, []>("op_653_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_653 = slice_by_index(begin = var_653_begin_0, end = var_653_end_0, end_mask = var_653_end_mask_0, x = x_15)[name = tensor<string, []>("op_653")];
+            tensor<int32, [3]> var_656_begin_0 = const()[name = tensor<string, []>("op_656_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_656_end_0 = const()[name = tensor<string, []>("op_656_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_656_end_mask_0 = const()[name = tensor<string, []>("op_656_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_656 = slice_by_index(begin = var_656_begin_0, end = var_656_end_0, end_mask = var_656_end_mask_0, x = window_13)[name = tensor<string, []>("op_656")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_620, var_617))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_625_begin_0 = const()[name = tensor<string, []>("op_625_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_625_end_0 = const()[name = tensor<string, []>("op_625_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_625_end_mask_0 = const()[name = tensor<string, []>("op_625_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_625 = slice_by_index(begin = var_625_begin_0, end = var_625_end_0, end_mask = var_625_end_mask_0, x = x_15)[name = tensor<string, []>("op_625")];
-            tensor<int32, [3]> var_628_begin_0 = const()[name = tensor<string, []>("op_628_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_628_end_0 = const()[name = tensor<string, []>("op_628_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_628_end_mask_0 = const()[name = tensor<string, []>("op_628_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_628 = slice_by_index(begin = var_628_begin_0, end = var_628_end_0, end_mask = var_628_end_mask_0, x = window_15)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_62, interleave = window_15_interleave_0, values = (var_656, var_653))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_661_begin_0 = const()[name = tensor<string, []>("op_661_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_661_end_0 = const()[name = tensor<string, []>("op_661_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_661_end_mask_0 = const()[name = tensor<string, []>("op_661_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_661 = slice_by_index(begin = var_661_begin_0, end = var_661_end_0, end_mask = var_661_end_mask_0, x = x_15)[name = tensor<string, []>("op_661")];
+            tensor<int32, [3]> var_664_begin_0 = const()[name = tensor<string, []>("op_664_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_664_end_0 = const()[name = tensor<string, []>("op_664_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_664_end_mask_0 = const()[name = tensor<string, []>("op_664_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_664 = slice_by_index(begin = var_664_begin_0, end = var_664_end_0, end_mask = var_664_end_mask_0, x = window_15)[name = tensor<string, []>("op_664")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_628, var_625))[name = tensor<string, []>("window_17")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_62, interleave = window_17_interleave_0, values = (var_664, var_661))[name = tensor<string, []>("window_17")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_103 = concat(axis = var_49, interleave = input_103_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_653_split_sizes_0 = const()[name = tensor<string, []>("op_653_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_653_axis_0 = const()[name = tensor<string, []>("op_653_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_653_0, tensor<fp32, [2, 256, 16]> var_653_1 = split(axis = var_653_axis_0, split_sizes = var_653_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_653")];
-            tensor<fp32, [2, 256, 16]> var_655 = sigmoid(x = var_653_1)[name = tensor<string, []>("op_655")];
-            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_653_0, y = var_655)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [2, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_689_split_sizes_0 = const()[name = tensor<string, []>("op_689_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_689_axis_0 = const()[name = tensor<string, []>("op_689_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_689_0, tensor<fp32, [2, 256, 16]> var_689_1 = split(axis = var_689_axis_0, split_sizes = var_689_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [2, 256, 16]> var_691 = sigmoid(x = var_689_1)[name = tensor<string, []>("op_691")];
+            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_689_0, y = var_691)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [2, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [2, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_686_begin_0 = const()[name = tensor<string, []>("op_686_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_686_end_0 = const()[name = tensor<string, []>("op_686_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_686_end_mask_0 = const()[name = tensor<string, []>("op_686_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [2, 1, 256]> var_686 = slice_by_index(begin = var_686_begin_0, end = var_686_end_0, end_mask = var_686_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_686")];
-            tensor<int32, [3]> var_688_perm_0 = const()[name = tensor<string, []>("op_688_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_688 = transpose(perm = var_688_perm_0, x = var_686)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 2, 256]> input_111 = add(x = x_15, y = var_688)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 2, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 2, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_711 = const()[name = tensor<string, []>("op_711"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_712 = mul(x = input_119, y = var_711)[name = tensor<string, []>("op_712")];
-            tensor<fp32, [1, 2, 256]> input_121 = add(x = var_712, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_722_begin_0 = const()[name = tensor<string, []>("op_722_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_722_end_0 = const()[name = tensor<string, []>("op_722_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_722_end_mask_0 = const()[name = tensor<string, []>("op_722_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [2, 1, 256]> var_722 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_722")];
+            tensor<int32, [3]> var_724_perm_0 = const()[name = tensor<string, []>("op_724_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_724 = transpose(perm = var_724_perm_0, x = var_722)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 2, 256]> input_113 = add(x = x_15, y = var_724)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 2, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 2, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_748 = mul(x = input_121, y = var_747)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 2, 256]> input_123 = add(x = var_748, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 2, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 2, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_742 = mul(x = input_131, y = var_741)[name = tensor<string, []>("op_742")];
-            tensor<fp32, [1, 2, 256]> input_133 = add(x = var_742, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 2, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 2, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_777 = const()[name = tensor<string, []>("op_777"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_778 = mul(x = input_133, y = var_777)[name = tensor<string, []>("op_778")];
+            tensor<fp32, [1, 2, 256]> input_135 = add(x = var_778, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -710,189 +724,182 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 2, 256]> var_756 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_757 = const()[name = tensor<string, []>("op_757"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_758 = reshape(shape = var_757, x = var_756)[name = tensor<string, []>("op_758")];
+            tensor<fp32, [1, 2, 256]> var_792 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_793 = const()[name = tensor<string, []>("op_793"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_794 = reshape(shape = var_793, x = var_792)[name = tensor<string, []>("op_794")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_762 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_763 = const()[name = tensor<string, []>("op_763"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_764 = mul(x = var_762, y = var_763)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
+            tensor<fp32, [1, 2, 256]> var_798 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_800 = mul(x = var_798, y = var_799)[name = tensor<string, []>("op_800")];
+            tensor<int32, [4]> var_801 = const()[name = tensor<string, []>("op_801"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_802 = reshape(shape = var_801, x = var_800)[name = tensor<string, []>("op_802")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_770 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_771 = const()[name = tensor<string, []>("op_771"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_772 = reshape(shape = var_771, x = var_770)[name = tensor<string, []>("op_772")];
+            tensor<fp32, [1, 2, 256]> var_806 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_808 = reshape(shape = var_807, x = var_806)[name = tensor<string, []>("op_808")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 2, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [2]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_766)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_758)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_802)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_794)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 2, 2]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_783 = reshape(shape = var_782, x = sqrt_s_t_7)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [2, 2]> M_7 = real_div(x = encoder__causal_mask, y = var_783)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 2, 2]> var_785 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_785")];
+            tensor<int32, [2]> var_818 = const()[name = tensor<string, []>("op_818"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_819 = reshape(shape = var_818, x = sqrt_s_t_7)[name = tensor<string, []>("op_819")];
+            tensor<fp32, [2, 2]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_819)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 2, 2]> var_821 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_821")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_772)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_785, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_787_transpose_x_0 = const()[name = tensor<string, []>("op_787_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_787_transpose_y_0 = const()[name = tensor<string, []>("op_787_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_787 = matmul(transpose_x = var_787_transpose_x_0, transpose_y = var_787_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_787")];
-            tensor<fp32, [2]> var_788 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_787, y = var_790)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_808)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_821, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_823_transpose_x_0 = const()[name = tensor<string, []>("op_823_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_823_transpose_y_0 = const()[name = tensor<string, []>("op_823_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_823 = matmul(transpose_x = var_823_transpose_x_0, transpose_y = var_823_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_823")];
+            tensor<fp32, [2]> var_824 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_824")];
+            tensor<int32, [4]> var_825 = const()[name = tensor<string, []>("op_825"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_826 = reshape(shape = var_825, x = var_824)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_823, y = var_826)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 2, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_793 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_793")];
-            tensor<bool, []> var_795_transpose_x_1 = const()[name = tensor<string, []>("op_795_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_795_transpose_y_1 = const()[name = tensor<string, []>("op_795_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_795 = matmul(transpose_x = var_795_transpose_x_1, transpose_y = var_795_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_795")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_793, y = var_795)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_797 = const()[name = tensor<string, []>("op_797"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_797)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_799 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_799")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_799)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_801_perm_0 = const()[name = tensor<string, []>("op_801_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_829 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_829")];
+            tensor<bool, []> var_831_transpose_x_1 = const()[name = tensor<string, []>("op_831_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_831_transpose_y_1 = const()[name = tensor<string, []>("op_831_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_831 = matmul(transpose_x = var_831_transpose_x_1, transpose_y = var_831_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_829, y = var_831)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_833)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_835 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_835")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_835)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_837_perm_0 = const()[name = tensor<string, []>("op_837_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_801 = transpose(perm = var_801_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_801)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_805, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 2, 256]> var_807 = silu(x = input_137)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [1, 2, 256]> input_139 = mul(x = var_807, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 2, 4, 64]> var_837 = transpose(perm = var_837_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_54, x = var_837)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_841, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 2, 256]> var_843 = silu(x = input_139)[name = tensor<string, []>("op_843")];
+            tensor<fp32, [1, 2, 256]> input_141 = mul(x = var_843, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_19_begin_0 = const()[name = tensor<string, []>("window_19_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_19_end_0 = const()[name = tensor<string, []>("window_19_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_19_end_mask_0 = const()[name = tensor<string, []>("window_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_19_squeeze_mask_0 = const()[name = tensor<string, []>("window_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_19 = slice_by_index(begin = window_19_begin_0, end = window_19_end_0, end_mask = window_19_end_mask_0, squeeze_mask = window_19_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_815_begin_0 = const()[name = tensor<string, []>("op_815_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_815_end_0 = const()[name = tensor<string, []>("op_815_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_815_end_mask_0 = const()[name = tensor<string, []>("op_815_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_815 = slice_by_index(begin = var_815_begin_0, end = var_815_end_0, end_mask = var_815_end_mask_0, x = x_21)[name = tensor<string, []>("op_815")];
-            tensor<int32, [3]> var_818_begin_0 = const()[name = tensor<string, []>("op_818_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_818_end_0 = const()[name = tensor<string, []>("op_818_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_818_end_mask_0 = const()[name = tensor<string, []>("op_818_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_818 = slice_by_index(begin = var_818_begin_0, end = var_818_end_0, end_mask = var_818_end_mask_0, x = window_19)[name = tensor<string, []>("op_818")];
+            tensor<int32, [3]> var_851_begin_0 = const()[name = tensor<string, []>("op_851_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_851_end_0 = const()[name = tensor<string, []>("op_851_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_851_end_mask_0 = const()[name = tensor<string, []>("op_851_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_851 = slice_by_index(begin = var_851_begin_0, end = var_851_end_0, end_mask = var_851_end_mask_0, x = x_21)[name = tensor<string, []>("op_851")];
+            tensor<int32, [3]> var_854_begin_0 = const()[name = tensor<string, []>("op_854_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_854_end_0 = const()[name = tensor<string, []>("op_854_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_854_end_mask_0 = const()[name = tensor<string, []>("op_854_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_854 = slice_by_index(begin = var_854_begin_0, end = var_854_end_0, end_mask = var_854_end_mask_0, x = window_19)[name = tensor<string, []>("op_854")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_818, var_815))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_823_begin_0 = const()[name = tensor<string, []>("op_823_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_823_end_0 = const()[name = tensor<string, []>("op_823_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_823_end_mask_0 = const()[name = tensor<string, []>("op_823_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_823 = slice_by_index(begin = var_823_begin_0, end = var_823_end_0, end_mask = var_823_end_mask_0, x = x_21)[name = tensor<string, []>("op_823")];
-            tensor<int32, [3]> var_826_begin_0 = const()[name = tensor<string, []>("op_826_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_826_end_0 = const()[name = tensor<string, []>("op_826_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_826_end_mask_0 = const()[name = tensor<string, []>("op_826_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_826 = slice_by_index(begin = var_826_begin_0, end = var_826_end_0, end_mask = var_826_end_mask_0, x = window_21)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_62, interleave = window_21_interleave_0, values = (var_854, var_851))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_859_begin_0 = const()[name = tensor<string, []>("op_859_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_859_end_0 = const()[name = tensor<string, []>("op_859_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_859_end_mask_0 = const()[name = tensor<string, []>("op_859_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_859 = slice_by_index(begin = var_859_begin_0, end = var_859_end_0, end_mask = var_859_end_mask_0, x = x_21)[name = tensor<string, []>("op_859")];
+            tensor<int32, [3]> var_862_begin_0 = const()[name = tensor<string, []>("op_862_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_862_end_0 = const()[name = tensor<string, []>("op_862_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_862_end_mask_0 = const()[name = tensor<string, []>("op_862_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_862 = slice_by_index(begin = var_862_begin_0, end = var_862_end_0, end_mask = var_862_end_mask_0, x = window_21)[name = tensor<string, []>("op_862")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_826, var_823))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_62, interleave = window_interleave_0, values = (var_862, var_859))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_143 = concat(axis = var_49, interleave = input_143_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_851_split_sizes_0 = const()[name = tensor<string, []>("op_851_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_851_axis_0 = const()[name = tensor<string, []>("op_851_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_851_0, tensor<fp32, [2, 256, 16]> var_851_1 = split(axis = var_851_axis_0, split_sizes = var_851_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_851")];
-            tensor<fp32, [2, 256, 16]> var_853 = sigmoid(x = var_851_1)[name = tensor<string, []>("op_853")];
-            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_851_0, y = var_853)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [2, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_887_split_sizes_0 = const()[name = tensor<string, []>("op_887_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_887_axis_0 = const()[name = tensor<string, []>("op_887_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_887_0, tensor<fp32, [2, 256, 16]> var_887_1 = split(axis = var_887_axis_0, split_sizes = var_887_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [2, 256, 16]> var_889 = sigmoid(x = var_887_1)[name = tensor<string, []>("op_889")];
+            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_887_0, y = var_889)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [2, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [2, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_884_begin_0 = const()[name = tensor<string, []>("op_884_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_884_end_0 = const()[name = tensor<string, []>("op_884_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_884_end_mask_0 = const()[name = tensor<string, []>("op_884_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [2, 1, 256]> var_884 = slice_by_index(begin = var_884_begin_0, end = var_884_end_0, end_mask = var_884_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_884")];
-            tensor<int32, [3]> var_886_perm_0 = const()[name = tensor<string, []>("op_886_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_886 = transpose(perm = var_886_perm_0, x = var_884)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 2, 256]> input_151 = add(x = x_21, y = var_886)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 2, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 2, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_910 = mul(x = input_159, y = var_909)[name = tensor<string, []>("op_910")];
-            tensor<fp32, [1, 2, 256]> input_161 = add(x = var_910, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [2, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_920")];
+            tensor<int32, [3]> var_922_perm_0 = const()[name = tensor<string, []>("op_922_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_922 = transpose(perm = var_922_perm_0, x = var_920)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 2, 256]> input_153 = add(x = x_21, y = var_922)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 2, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 2, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_945 = const()[name = tensor<string, []>("op_945"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_946 = mul(x = input_161, y = var_945)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 2, 256]> input_163 = add(x = var_946, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 2]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_51, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
-            tensor<int32, [3]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
-            tensor<bool, [3]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = cat)[name = tensor<string, []>("op_928")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 2, 1]> var_931 = reduce_l2_norm(axes = var_930, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
+            tensor<int32, [3]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
+            tensor<bool, [3]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = cat)[name = tensor<string, []>("op_964")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_966 = const()[name = tensor<string, []>("op_966"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 2, 1]> var_967 = reduce_l2_norm(axes = var_966, keep_dims = var_45, x = input_165)[name = tensor<string, []>("op_967")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_931)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_935_axis_0 = const()[name = tensor<string, []>("op_935_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_935_axis_0, values = (var_206, var_404, var_602, nkv_1))[name = tensor<string, []>("op_935")];
-            tensor<int32, []> var_937_axis_0 = const()[name = tensor<string, []>("op_937_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_937_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_937")];
-            tensor<int32, []> var_939_axis_0 = const()[name = tensor<string, []>("op_939_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_939_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_939")];
-            tensor<fp32, []> var_948 = const()[name = tensor<string, []>("op_948"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_953 = const()[name = tensor<string, []>("op_953"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_955 = const()[name = tensor<string, []>("op_955"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_956 = const()[name = tensor<string, []>("op_956"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_958 = const()[name = tensor<string, []>("op_958"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_968 = const()[name = tensor<string, []>("op_968"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_59, beta = const_12, x = var_967)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_971_axis_0, values = (var_242, var_440, var_638, nkv_1))[name = tensor<string, []>("op_971")];
+            tensor<int32, []> var_973_axis_0 = const()[name = tensor<string, []>("op_973_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_973_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_973")];
+            tensor<int32, []> var_975_axis_0 = const()[name = tensor<string, []>("op_975_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_975_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_975")];
             tensor<fp32, [1, 2, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 2, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1030_axes_0 = const()[name = tensor<string, []>("op_1030_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 2, 1, 256]> var_1030 = expand_dims(axes = var_1030_axes_0, x = emb)[name = tensor<string, []>("op_1030")];
+            tensor<int32, [1]> var_1043_axes_0 = const()[name = tensor<string, []>("op_1043_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 2, 1, 256]> var_1043 = expand_dims(axes = var_1043_axes_0, x = emb)[name = tensor<string, []>("op_1043")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 2, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1030)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 2, 12, 512]> input_165 = concat(axis = var_962, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 2, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1038_perm_0 = const()[name = tensor<string, []>("op_1038_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1042 = const()[name = tensor<string, []>("op_1042"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1038 = transpose(perm = var_1038_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 2, 256]> x_29 = reshape(shape = var_1042, x = var_1038)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 2, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1043)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 2, 12, 512]> input_167 = concat(axis = var_52, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 2, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1051_perm_0 = const()[name = tensor<string, []>("op_1051_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1051 = transpose(perm = var_1051_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 2, 256]> x_29 = reshape(shape = var_1055, x = var_1051)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -903,132 +910,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 2, 256]> var_1050 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1051 = const()[name = tensor<string, []>("op_1051"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1052 = reshape(shape = var_1051, x = var_1050)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [12, 2, 256]> var_1063 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1064 = const()[name = tensor<string, []>("op_1064"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1065 = reshape(shape = var_1064, x = var_1063)[name = tensor<string, []>("op_1065")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1056 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1057 = const()[name = tensor<string, []>("op_1057"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 2, 256]> var_1058 = mul(x = var_1056, y = var_1057)[name = tensor<string, []>("op_1058")];
-            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [12, 2, 256]> var_1069 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1070 = const()[name = tensor<string, []>("op_1070"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 2, 256]> var_1071 = mul(x = var_1069, y = var_1070)[name = tensor<string, []>("op_1071")];
+            tensor<int32, [4]> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1073 = reshape(shape = var_1072, x = var_1071)[name = tensor<string, []>("op_1073")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1064 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1066 = reshape(shape = var_1065, x = var_1064)[name = tensor<string, []>("op_1066")];
+            tensor<fp32, [12, 2, 256]> var_1077 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1079 = reshape(shape = var_1078, x = var_1077)[name = tensor<string, []>("op_1079")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 2, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_968, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_49, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [2]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_1 = clip(alpha = var_958, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [2]> clip_1 = clip(alpha = var_39, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [2]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1060)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1052)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1073)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1065)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 2, 2]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [2]>([1, 2])];
-            tensor<fp32, [1, 2]> var_1079 = reshape(shape = var_1078, x = valid_mask)[name = tensor<string, []>("op_1079")];
-            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1079)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1082 = reshape(shape = var_1081, x = sqrt_s_t_9)[name = tensor<string, []>("op_1082")];
-            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1082)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 2, 2]> var_1084 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1084")];
+            tensor<int32, [2]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [2]>([1, 2])];
+            tensor<fp32, [1, 2]> var_1092 = reshape(shape = var_1091, x = valid_mask)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1092)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1094 = const()[name = tensor<string, []>("op_1094"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1095 = reshape(shape = var_1094, x = sqrt_s_t_9)[name = tensor<string, []>("op_1095")];
+            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1095)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 2, 2]> var_1097 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1097")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1066)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1084, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1086_transpose_x_0 = const()[name = tensor<string, []>("op_1086_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1086_transpose_y_0 = const()[name = tensor<string, []>("op_1086_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> var_1086 = matmul(transpose_x = var_1086_transpose_x_0, transpose_y = var_1086_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1086")];
-            tensor<fp32, [2]> var_1087 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1087")];
-            tensor<int32, [4]> var_1088 = const()[name = tensor<string, []>("op_1088"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1089 = reshape(shape = var_1088, x = var_1087)[name = tensor<string, []>("op_1089")];
-            tensor<fp32, [12, 4, 2, 64]> cross_9 = mul(x = var_1086, y = var_1089)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1079)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1097, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1099_transpose_x_0 = const()[name = tensor<string, []>("op_1099_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1099_transpose_y_0 = const()[name = tensor<string, []>("op_1099_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> var_1099 = matmul(transpose_x = var_1099_transpose_x_0, transpose_y = var_1099_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1099")];
+            tensor<fp32, [2]> var_1100 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1100")];
+            tensor<int32, [4]> var_1101 = const()[name = tensor<string, []>("op_1101"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1102 = reshape(shape = var_1101, x = var_1100)[name = tensor<string, []>("op_1102")];
+            tensor<fp32, [12, 4, 2, 64]> cross_9 = mul(x = var_1099, y = var_1102)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 2, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1092 = const()[name = tensor<string, []>("op_1092"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1093 = reshape(shape = var_1092, x = valid_mask)[name = tensor<string, []>("op_1093")];
-            tensor<fp32, [12, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1093)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1095 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1095")];
-            tensor<bool, []> var_1097_transpose_x_1 = const()[name = tensor<string, []>("op_1097_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1097_transpose_y_1 = const()[name = tensor<string, []>("op_1097_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1097 = matmul(transpose_x = var_1097_transpose_x_1, transpose_y = var_1097_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1097")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1095, y = var_1097)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1099_keep_dims_0 = const()[name = tensor<string, []>("op_1099_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1099 = reduce_sum(keep_dims = var_1099_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1099")];
-            tensor<int32, [1]> var_1100 = const()[name = tensor<string, []>("op_1100"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1101 = reshape(shape = var_1100, x = var_1099)[name = tensor<string, []>("op_1101")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1101)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1105 = const()[name = tensor<string, []>("op_1105"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1106 = reshape(shape = var_1105, x = valid_mask)[name = tensor<string, []>("op_1106")];
+            tensor<fp32, [12, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1106)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1108 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1108")];
+            tensor<bool, []> var_1110_transpose_x_1 = const()[name = tensor<string, []>("op_1110_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1110_transpose_y_1 = const()[name = tensor<string, []>("op_1110_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1110 = matmul(transpose_x = var_1110_transpose_x_1, transpose_y = var_1110_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1110")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1108, y = var_1110)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1112_keep_dims_0 = const()[name = tensor<string, []>("op_1112_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1112 = reduce_sum(keep_dims = var_1112_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1112")];
+            tensor<int32, [1]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1114 = reshape(shape = var_1113, x = var_1112)[name = tensor<string, []>("op_1114")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1114)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_958, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_39, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1105 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1105")];
-            tensor<int32, [4]> var_1106_perm_0 = const()[name = tensor<string, []>("op_1106_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1118 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1118")];
+            tensor<int32, [4]> var_1119_perm_0 = const()[name = tensor<string, []>("op_1119_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 4, 64]> var_1106 = transpose(perm = var_1106_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_955, x = var_1106)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> out_29 = reshape(shape = var_1110, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 2, 256]> var_1112 = silu(x = input_169)[name = tensor<string, []>("op_1112")];
-            tensor<fp32, [12, 2, 256]> input_171 = mul(x = var_1112, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 2, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 2, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 2, 4, 64]> var_1119 = transpose(perm = var_1119_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_54, x = var_1119)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> out_29 = reshape(shape = var_1123, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 2, 256]> var_1125 = silu(x = input_171)[name = tensor<string, []>("op_1125")];
+            tensor<fp32, [12, 2, 256]> input_173 = mul(x = var_1125, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 2, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 2, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_953, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1122 = const()[name = tensor<string, []>("op_1122"), val = tensor<int32, [4]>([1, 12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1123 = reshape(shape = var_1122, x = xt_1)[name = tensor<string, []>("op_1123")];
-            tensor<int32, [4]> var_1124_perm_0 = const()[name = tensor<string, []>("op_1124_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1127 = const()[name = tensor<string, []>("op_1127"), val = tensor<int32, [3]>([2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> var_1124 = transpose(perm = var_1124_perm_0, x = var_1123)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [2, 12, 256]> query_1 = reshape(shape = var_1127, x = var_1124)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_46, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1135 = const()[name = tensor<string, []>("op_1135"), val = tensor<int32, [4]>([1, 12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1136 = reshape(shape = var_1135, x = xt_1)[name = tensor<string, []>("op_1136")];
+            tensor<int32, [4]> var_1137_perm_0 = const()[name = tensor<string, []>("op_1137_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1140 = const()[name = tensor<string, []>("op_1140"), val = tensor<int32, [3]>([2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> var_1137 = transpose(perm = var_1137_perm_0, x = var_1136)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [2, 12, 256]> query_1 = reshape(shape = var_1140, x = var_1137)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 2, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 2, 768]> var_1150 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 2, 768]> var_1163 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 2, 3, 256])];
-            tensor<fp32, [12, 2, 3, 256]> var_1152 = reshape(shape = concat_1, x = var_1150)[name = tensor<string, []>("op_1152")];
-            tensor<int32, [1]> var_1153_axes_0 = const()[name = tensor<string, []>("op_1153_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 2, 3, 256]> var_1153 = expand_dims(axes = var_1153_axes_0, x = var_1152)[name = tensor<string, []>("op_1153")];
-            tensor<int32, [5]> var_1154_perm_0 = const()[name = tensor<string, []>("op_1154_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1155_axes_0 = const()[name = tensor<string, []>("op_1155_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 2, 1, 256]> var_1154 = transpose(perm = var_1154_perm_0, x = var_1153)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 2, 256]> var_1155 = squeeze(axes = var_1155_axes_0, x = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<fp32, [12, 2, 3, 256]> var_1165 = reshape(shape = concat_1, x = var_1163)[name = tensor<string, []>("op_1165")];
+            tensor<int32, [1]> var_1166_axes_0 = const()[name = tensor<string, []>("op_1166_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 2, 3, 256]> var_1166 = expand_dims(axes = var_1166_axes_0, x = var_1165)[name = tensor<string, []>("op_1166")];
+            tensor<int32, [5]> var_1167_perm_0 = const()[name = tensor<string, []>("op_1167_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = tensor<string, []>("op_1168_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 2, 1, 256]> var_1167 = transpose(perm = var_1167_perm_0, x = var_1166)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 2, 256]> var_1168 = squeeze(axes = var_1168_axes_0, x = var_1167)[name = tensor<string, []>("op_1168")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 2, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 2, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 2, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1163 = const()[name = tensor<string, []>("op_1163"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1164 = reshape(shape = var_1163, x = q_11)[name = tensor<string, []>("op_1164")];
+            tensor<fp32, [12, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1176 = const()[name = tensor<string, []>("op_1176"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1177 = reshape(shape = var_1176, x = q_11)[name = tensor<string, []>("op_1177")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1170 = const()[name = tensor<string, []>("op_1170"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1171 = reshape(shape = var_1170, x = k_11)[name = tensor<string, []>("op_1171")];
+            tensor<int32, [3]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1184 = reshape(shape = var_1183, x = k_11)[name = tensor<string, []>("op_1184")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1178 = reshape(shape = var_1177, x = v_11)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [3]> var_1190 = const()[name = tensor<string, []>("op_1190"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1191 = reshape(shape = var_1190, x = v_11)[name = tensor<string, []>("op_1191")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1181 = const()[name = tensor<string, []>("op_1181"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1164)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [2, 4, 12, 64]> q_15 = reshape(shape = var_1181, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1171)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [2, 4, 12, 64]> k_15 = reshape(shape = var_1183, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1178)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [2, 4, 12, 64]> v_15 = reshape(shape = var_1185, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1177)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [2, 4, 12, 64]> q_15 = reshape(shape = var_1194, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1196 = const()[name = tensor<string, []>("op_1196"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1184)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [2, 4, 12, 64]> k_15 = reshape(shape = var_1196, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1198 = const()[name = tensor<string, []>("op_1198"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1191)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [2, 4, 12, 64]> v_15 = reshape(shape = var_1198, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1039,30 +1046,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1193 = const()[name = tensor<string, []>("op_1193"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [12, 2, 4, 64]> var_1189 = transpose(perm = var_1188, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1193, x = var_1189)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> attn_output_7 = reshape(shape = var_1197, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1201 = const()[name = tensor<string, []>("op_1201"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [12, 2, 4, 64]> var_1202 = transpose(perm = var_1201, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1206, x = var_1202)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1210 = const()[name = tensor<string, []>("op_1210"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> attn_output_7 = reshape(shape = var_1210, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [2, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_953, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [2, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [2, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [2, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [2, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_46, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [2, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [2, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [2, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_953, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([1, 2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> x_31 = reshape(shape = var_1217, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1219_perm_0 = const()[name = tensor<string, []>("op_1219_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1223 = const()[name = tensor<string, []>("op_1223"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1219 = transpose(perm = var_1219_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 2, 256]> x = reshape(shape = var_1223, x = var_1219)[name = tensor<string, []>("x")];
+            tensor<fp32, [2, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_46, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [4]>([1, 2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> x_31 = reshape(shape = var_1230, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1232_perm_0 = const()[name = tensor<string, []>("op_1232_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1232 = transpose(perm = var_1232_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 2, 256]> x = reshape(shape = var_1236, x = var_1232)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1073,120 +1080,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 2, 256]> var_1231 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1233 = reshape(shape = var_1232, x = var_1231)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [12, 2, 256]> var_1244 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1246 = reshape(shape = var_1245, x = var_1244)[name = tensor<string, []>("op_1246")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1237 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 2, 256]> var_1239 = mul(x = var_1237, y = var_1238)[name = tensor<string, []>("op_1239")];
-            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [12, 2, 256]> var_1250 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1251 = const()[name = tensor<string, []>("op_1251"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 2, 256]> var_1252 = mul(x = var_1250, y = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<int32, [4]> var_1253 = const()[name = tensor<string, []>("op_1253"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1254 = reshape(shape = var_1253, x = var_1252)[name = tensor<string, []>("op_1254")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1245 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1246 = const()[name = tensor<string, []>("op_1246"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1247 = reshape(shape = var_1246, x = var_1245)[name = tensor<string, []>("op_1247")];
+            tensor<fp32, [12, 2, 256]> var_1258 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1260 = reshape(shape = var_1259, x = var_1258)[name = tensor<string, []>("op_1260")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 2, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [2]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_3 = clip(alpha = var_958, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [2]> clip_3 = clip(alpha = var_39, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [2]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1241)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1233)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1254)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1246)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 2, 2]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1263 = reshape(shape = var_1262, x = sqrt_s_t)[name = tensor<string, []>("op_1263")];
-            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1263)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 2, 2]> var_1265 = mul(x = qk, y = M)[name = tensor<string, []>("op_1265")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1247)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 2, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1265, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1267_transpose_x_0 = const()[name = tensor<string, []>("op_1267_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1267_transpose_y_0 = const()[name = tensor<string, []>("op_1267_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> var_1267 = matmul(transpose_x = var_1267_transpose_x_0, transpose_y = var_1267_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1267")];
-            tensor<fp32, [2]> var_1268 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1268")];
-            tensor<int32, [4]> var_1269 = const()[name = tensor<string, []>("op_1269"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1270 = reshape(shape = var_1269, x = var_1268)[name = tensor<string, []>("op_1270")];
-            tensor<fp32, [12, 4, 2, 64]> cross = mul(x = var_1267, y = var_1270)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 2, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1093)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1276 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1276")];
-            tensor<bool, []> var_1278_transpose_x_1 = const()[name = tensor<string, []>("op_1278_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1278_transpose_y_1 = const()[name = tensor<string, []>("op_1278_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1278 = matmul(transpose_x = var_1278_transpose_x_1, transpose_y = var_1278_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1278")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1276, y = var_1278)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1101)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1275 = const()[name = tensor<string, []>("op_1275"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1276 = reshape(shape = var_1275, x = sqrt_s_t)[name = tensor<string, []>("op_1276")];
+            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1276)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 2, 2]> var_1278 = mul(x = qk, y = M)[name = tensor<string, []>("op_1278")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1260)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 2, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1278, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1280_transpose_x_0 = const()[name = tensor<string, []>("op_1280_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1280_transpose_y_0 = const()[name = tensor<string, []>("op_1280_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> var_1280 = matmul(transpose_x = var_1280_transpose_x_0, transpose_y = var_1280_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1280")];
+            tensor<fp32, [2]> var_1281 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1281")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1283 = reshape(shape = var_1282, x = var_1281)[name = tensor<string, []>("op_1283")];
+            tensor<fp32, [12, 4, 2, 64]> cross = mul(x = var_1280, y = var_1283)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 2, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1106)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1289 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1289")];
+            tensor<bool, []> var_1291_transpose_x_1 = const()[name = tensor<string, []>("op_1291_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1291_transpose_y_1 = const()[name = tensor<string, []>("op_1291_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1291 = matmul(transpose_x = var_1291_transpose_x_1, transpose_y = var_1291_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1291")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1289, y = var_1291)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1114)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_958, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_39, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1287_perm_0 = const()[name = tensor<string, []>("op_1287_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1300_perm_0 = const()[name = tensor<string, []>("op_1300_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 4, 64]> var_1287 = transpose(perm = var_1287_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_955, x = var_1287)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1291 = const()[name = tensor<string, []>("op_1291"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> out = reshape(shape = var_1291, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 2, 256]> var_1293 = silu(x = input_187)[name = tensor<string, []>("op_1293")];
-            tensor<fp32, [12, 2, 256]> input_189 = mul(x = var_1293, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 2, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 2, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 2, 4, 64]> var_1300 = transpose(perm = var_1300_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_54, x = var_1300)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> out = reshape(shape = var_1304, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 2, 256]> var_1306 = silu(x = input_189)[name = tensor<string, []>("op_1306")];
+            tensor<fp32, [12, 2, 256]> input_191 = mul(x = var_1306, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 2, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 2, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_953, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [4]>([1, 12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1304 = reshape(shape = var_1303, x = xt_5)[name = tensor<string, []>("op_1304")];
-            tensor<int32, [4]> var_1305_perm_0 = const()[name = tensor<string, []>("op_1305_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1308 = const()[name = tensor<string, []>("op_1308"), val = tensor<int32, [3]>([2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> var_1305 = transpose(perm = var_1305_perm_0, x = var_1304)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [2, 12, 256]> query_5 = reshape(shape = var_1308, x = var_1305)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_46, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1316 = const()[name = tensor<string, []>("op_1316"), val = tensor<int32, [4]>([1, 12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1317 = reshape(shape = var_1316, x = xt_5)[name = tensor<string, []>("op_1317")];
+            tensor<int32, [4]> var_1318_perm_0 = const()[name = tensor<string, []>("op_1318_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [3]>([2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> var_1318 = transpose(perm = var_1318_perm_0, x = var_1317)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [2, 12, 256]> query_5 = reshape(shape = var_1321, x = var_1318)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 2, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 2, 768]> var_1331 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 2, 768]> var_1344 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 2, 3, 256])];
-            tensor<fp32, [12, 2, 3, 256]> var_1333 = reshape(shape = concat_2, x = var_1331)[name = tensor<string, []>("op_1333")];
-            tensor<int32, [1]> var_1334_axes_0 = const()[name = tensor<string, []>("op_1334_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 2, 3, 256]> var_1334 = expand_dims(axes = var_1334_axes_0, x = var_1333)[name = tensor<string, []>("op_1334")];
-            tensor<int32, [5]> var_1335_perm_0 = const()[name = tensor<string, []>("op_1335_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1336_axes_0 = const()[name = tensor<string, []>("op_1336_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 2, 1, 256]> var_1335 = transpose(perm = var_1335_perm_0, x = var_1334)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 2, 256]> var_1336 = squeeze(axes = var_1336_axes_0, x = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<fp32, [12, 2, 3, 256]> var_1346 = reshape(shape = concat_2, x = var_1344)[name = tensor<string, []>("op_1346")];
+            tensor<int32, [1]> var_1347_axes_0 = const()[name = tensor<string, []>("op_1347_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 2, 3, 256]> var_1347 = expand_dims(axes = var_1347_axes_0, x = var_1346)[name = tensor<string, []>("op_1347")];
+            tensor<int32, [5]> var_1348_perm_0 = const()[name = tensor<string, []>("op_1348_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1349_axes_0 = const()[name = tensor<string, []>("op_1349_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 2, 1, 256]> var_1348 = transpose(perm = var_1348_perm_0, x = var_1347)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 2, 256]> var_1349 = squeeze(axes = var_1349_axes_0, x = var_1348)[name = tensor<string, []>("op_1349")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 2, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 2, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 2, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1344 = const()[name = tensor<string, []>("op_1344"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1345 = reshape(shape = var_1344, x = q_19)[name = tensor<string, []>("op_1345")];
+            tensor<fp32, [12, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1358 = reshape(shape = var_1357, x = q_19)[name = tensor<string, []>("op_1358")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1351 = const()[name = tensor<string, []>("op_1351"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1352 = reshape(shape = var_1351, x = k_19)[name = tensor<string, []>("op_1352")];
+            tensor<int32, [3]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1365 = reshape(shape = var_1364, x = k_19)[name = tensor<string, []>("op_1365")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1359 = reshape(shape = var_1358, x = v_19)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [3]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1372 = reshape(shape = var_1371, x = v_19)[name = tensor<string, []>("op_1372")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1362 = const()[name = tensor<string, []>("op_1362"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1345)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [2, 4, 12, 64]> q = reshape(shape = var_1362, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1352)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [2, 4, 12, 64]> k = reshape(shape = var_1364, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1359)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [2, 4, 12, 64]> v = reshape(shape = var_1366, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1375 = const()[name = tensor<string, []>("op_1375"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1358)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [2, 4, 12, 64]> q = reshape(shape = var_1375, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1377 = const()[name = tensor<string, []>("op_1377"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1365)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [2, 4, 12, 64]> k = reshape(shape = var_1377, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1379 = const()[name = tensor<string, []>("op_1379"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1372)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [2, 4, 12, 64]> v = reshape(shape = var_1379, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1197,36 +1204,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1374 = const()[name = tensor<string, []>("op_1374"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [12, 2, 4, 64]> var_1370 = transpose(perm = var_1369, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1374, x = var_1370)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> attn_output = reshape(shape = var_1378, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1382 = const()[name = tensor<string, []>("op_1382"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1387 = const()[name = tensor<string, []>("op_1387"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [12, 2, 4, 64]> var_1383 = transpose(perm = var_1382, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1387, x = var_1383)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1391 = const()[name = tensor<string, []>("op_1391"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> attn_output = reshape(shape = var_1391, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [2, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_953, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [2, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [2, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [2, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [2, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_46, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [2, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [2, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [2, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_953, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([1, 2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> input = reshape(shape = var_1398, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 12, 1]> var_1401 = reduce_l2_norm(axes = var_1400, keep_dims = var_956, x = input)[name = tensor<string, []>("op_1401")];
+            tensor<fp32, [2, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_46, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1411 = const()[name = tensor<string, []>("op_1411"), val = tensor<int32, [4]>([1, 2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> input = reshape(shape = var_1411, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 12, 1]> var_1414 = reduce_l2_norm(axes = var_1413, keep_dims = var_45, x = input)[name = tensor<string, []>("op_1414")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 12, 1]> clip_5 = clip(alpha = var_948, beta = const_42, x = var_1401)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 2, 12, 256]> var_1403 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [1, 2, 12, 1]> clip_5 = clip(alpha = var_59, beta = const_42, x = var_1414)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 2, 12, 256]> var_1416 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1416")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([2, 1, 256])];
             tensor<fp32, [2, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([2, 256, 12])];
-            tensor<fp32, [1, 2, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1403)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 2, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1416)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [2, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1237,10 +1244,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 2, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 2, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 2, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1407")];
-            tensor<int32, []> var_1409_axis_0 = const()[name = tensor<string, []>("op_1409_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1409_axis_0, values = (var_1105, nkv))[name = tensor<string, []>("op_1409")];
-            tensor<int32, []> var_1411_axis_0 = const()[name = tensor<string, []>("op_1411_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1411_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1411")];
+            tensor<fp32, [1, 2, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1420")];
+            tensor<int32, []> var_1422_axis_0 = const()[name = tensor<string, []>("op_1422_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1422_axis_0, values = (var_1118, nkv))[name = tensor<string, []>("op_1422")];
+            tensor<int32, []> var_1424_axis_0 = const()[name = tensor<string, []>("op_1424_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1424_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1424")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 51b7851ed05bebc0051a092a84d8dcde58dd2ed9..ffbe0985dc3832581dca4b17786a3207330562f7 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2de52b68412da2b73e916665ebf41cce5eacd2010c99e37de2722c6ce40094a2
-size 179874
+oid sha256:9fd639729c2e10e76067e27f6f5a186e23712630cbb548e9b7620438e1ac94c6
+size 184854
diff --git a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Manifest.json b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Manifest.json
index 439619156a11723a4eb1eb25e302786f8f2998b0..ef6ea4a36e08efa69e926663bf9bddbfb17bdfd9 100644
--- a/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Manifest.json
+++ b/optimized/dih2/200ms/ls_eend_dih2_200ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "07858CB5-65AA-49FE-837C-C3CED8A0B958": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Weights",
-            "name": "weights",
-            "path": "com.apple.CoreML/weights"
-        },
-        "E331E090-E6BB-40EA-B66D-83ED2654BAA6": {
+        "5F736C82-ABFF-4D26-B541-9C4CCF47CDCA": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "B05FC407-28F8-4F04-B6D6-27CDC5F0D3B1": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "E331E090-E6BB-40EA-B66D-83ED2654BAA6"
+    "rootModelIdentifier": "5F736C82-ABFF-4D26-B541-9C4CCF47CDCA"
 }
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/analytics/coremldata.bin b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/analytics/coremldata.bin
index aa2356ddff010ed9351b885712064ec8c4872b59..65a87f95a21450aecd95f34377a2914d0593c27e 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e2c05f0fa28ecf43578092711f1d39b3176b08681e17319b4ff36eac9caff0
+oid sha256:16ca3f2bb974e8c6e8d39726f5a028c8f9b01e4b894b4fae3109c92c5f67b136
 size 243
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/coremldata.bin b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/coremldata.bin
index 4bac83cc94d2161567ef0eaff6090eac0493e52a..36fbd09f1caa54a582a266c828e716be7bb6c777 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/coremldata.bin
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f7894d3847f177d05bc7f718737302118c6e2735e776cb8d02cf573f134d75b4
-size 1308
+oid sha256:f0494f0c13f34614bb55bd5d8c5498411214bdef6540124ea7f63c9a8515f6c7
+size 1411
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/metadata.json b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/metadata.json
index 61a23767da180facd27d20100346d45d4522e1f9..fbdf79e0cb1eaf56046b0f5273fb89d1118c174b 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/metadata.json
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=3, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=3, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 56,
+      "Ios17.sliceByIndex" : 59,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 18,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 3 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 35 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 3, 345]",
+        "shape" : "[1, 35, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 35}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/model.mil b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/model.mil
index 48f1c415ceeabfb14a7edfb748becb4366d6de9f..d54a8f41d62f449a9ccc5f0085a42da328ef56f7 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/model.mil
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlmodelc/model.mil
@@ -1,234 +1,252 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 3, 345]> features, tensor<fp32, [3]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [3, 3]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [3]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [3, 3]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 3, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 35, 23]> features, tensor<fp32, [3]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [3, 3]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [3]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [3, 3]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<int32, [3]>([1, 3, 345])];
+            tensor<fp32, [1, 3, 345]> input_1 = reshape(shape = var_46, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_61 = const()[name = tensor<string, []>("op_61"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_64 = const()[name = tensor<string, []>("op_64"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 3, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 3, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 3, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_56, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 3, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 3, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_193 = const()[name = tensor<string, []>("op_193"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_194 = mul(x = input_13, y = var_193)[name = tensor<string, []>("op_194")];
+            tensor<fp32, [1, 3, 256]> input_15 = add(x = var_194, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,163 +257,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 3, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 3, 256]> var_208 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_209 = const()[name = tensor<string, []>("op_209"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_210 = reshape(shape = var_209, x = var_208)[name = tensor<string, []>("op_210")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 3, 256]> var_214 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_215 = const()[name = tensor<string, []>("op_215"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_216 = mul(x = var_214, y = var_215)[name = tensor<string, []>("op_216")];
+            tensor<int32, [4]> var_217 = const()[name = tensor<string, []>("op_217"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_218 = reshape(shape = var_217, x = var_216)[name = tensor<string, []>("op_218")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 3, 256]> var_222 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_223 = const()[name = tensor<string, []>("op_223"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_224 = reshape(shape = var_223, x = var_222)[name = tensor<string, []>("op_224")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 3, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [3]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_218)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_210)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 3, 3]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [3, 3]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 3, 3]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_234 = const()[name = tensor<string, []>("op_234"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_235 = reshape(shape = var_234, x = sqrt_s_t_1)[name = tensor<string, []>("op_235")];
+            tensor<fp32, [3, 3]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_235)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 3, 3]> var_237 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_237")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [3]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_224)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_237, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_239_transpose_x_0 = const()[name = tensor<string, []>("op_239_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_239_transpose_y_0 = const()[name = tensor<string, []>("op_239_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_239 = matmul(transpose_x = var_239_transpose_x_0, transpose_y = var_239_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [3]> var_240 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_240")];
+            tensor<int32, [4]> var_241 = const()[name = tensor<string, []>("op_241"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_242 = reshape(shape = var_241, x = var_240)[name = tensor<string, []>("op_242")];
+            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_239, y = var_242)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 3, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_245 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_245")];
+            tensor<bool, []> var_247_transpose_x_1 = const()[name = tensor<string, []>("op_247_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_247_transpose_y_1 = const()[name = tensor<string, []>("op_247_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_247 = matmul(transpose_x = var_247_transpose_x_1, transpose_y = var_247_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_247")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_245, y = var_247)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_249 = const()[name = tensor<string, []>("op_249"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_249)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_251 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_251")];
+            tensor<fp32, [1, 4, 64, 64]> var_252 = real_div(x = new_kv_unnorm_1, y = var_251)[name = tensor<string, []>("op_252")];
+            tensor<int32, [4]> var_253_perm_0 = const()[name = tensor<string, []>("op_253_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 3, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 3, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 3, 4, 64]> var_253 = transpose(perm = var_253_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_64, x = var_253)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_257 = const()[name = tensor<string, []>("op_257"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_257, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 3, 256]> var_259 = silu(x = input_19)[name = tensor<string, []>("op_259")];
+            tensor<fp32, [1, 3, 256]> input_21 = mul(x = var_259, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_267 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = x_3)[name = tensor<string, []>("op_267")];
+            tensor<int32, [3]> var_270_begin_0 = const()[name = tensor<string, []>("op_270_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_270_end_0 = const()[name = tensor<string, []>("op_270_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_270_end_mask_0 = const()[name = tensor<string, []>("op_270_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_270 = slice_by_index(begin = var_270_begin_0, end = var_270_end_0, end_mask = var_270_end_mask_0, x = window_1)[name = tensor<string, []>("op_270")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_72, interleave = window_3_interleave_0, values = (var_270, var_267))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_275 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = x_3)[name = tensor<string, []>("op_275")];
+            tensor<int32, [3]> var_278_begin_0 = const()[name = tensor<string, []>("op_278_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_278_end_0 = const()[name = tensor<string, []>("op_278_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_278_end_mask_0 = const()[name = tensor<string, []>("op_278_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_278 = slice_by_index(begin = var_278_begin_0, end = var_278_end_0, end_mask = var_278_end_mask_0, x = window_3)[name = tensor<string, []>("op_278")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_72, interleave = window_5_interleave_0, values = (var_278, var_275))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_283 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = x_3)[name = tensor<string, []>("op_283")];
+            tensor<int32, [3]> var_286_begin_0 = const()[name = tensor<string, []>("op_286_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_286_end_0 = const()[name = tensor<string, []>("op_286_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_286_end_mask_0 = const()[name = tensor<string, []>("op_286_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_286 = slice_by_index(begin = var_286_begin_0, end = var_286_end_0, end_mask = var_286_end_mask_0, x = window_5)[name = tensor<string, []>("op_286")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_72, interleave = window_7_interleave_0, values = (var_286, var_283))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_23 = concat(axis = var_59, interleave = input_23_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_265_split_sizes_0 = const()[name = tensor<string, []>("op_265_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_265_axis_0 = const()[name = tensor<string, []>("op_265_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_265_0, tensor<fp32, [3, 256, 16]> var_265_1 = split(axis = var_265_axis_0, split_sizes = var_265_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_265")];
-            tensor<fp32, [3, 256, 16]> var_267 = sigmoid(x = var_265_1)[name = tensor<string, []>("op_267")];
-            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_265_0, y = var_267)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [3, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_311_split_sizes_0 = const()[name = tensor<string, []>("op_311_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_311_axis_0 = const()[name = tensor<string, []>("op_311_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_311_0, tensor<fp32, [3, 256, 16]> var_311_1 = split(axis = var_311_axis_0, split_sizes = var_311_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_311")];
+            tensor<fp32, [3, 256, 16]> var_313 = sigmoid(x = var_311_1)[name = tensor<string, []>("op_313")];
+            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_311_0, y = var_313)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [3, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [3, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [3, 1, 256]> var_298 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_298")];
-            tensor<int32, [3]> var_300_perm_0 = const()[name = tensor<string, []>("op_300_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_300 = transpose(perm = var_300_perm_0, x = var_298)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 3, 256]> input_31 = add(x = x_3, y = var_300)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 3, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 3, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_323 = const()[name = tensor<string, []>("op_323"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_324 = mul(x = input_39, y = var_323)[name = tensor<string, []>("op_324")];
-            tensor<fp32, [1, 3, 256]> input_41 = add(x = var_324, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_344_begin_0 = const()[name = tensor<string, []>("op_344_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_344_end_0 = const()[name = tensor<string, []>("op_344_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_344_end_mask_0 = const()[name = tensor<string, []>("op_344_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [3, 1, 256]> var_344 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_344")];
+            tensor<int32, [3]> var_346_perm_0 = const()[name = tensor<string, []>("op_346_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_346 = transpose(perm = var_346_perm_0, x = var_344)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 3, 256]> input_33 = add(x = x_3, y = var_346)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 3, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 3, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_370 = mul(x = input_41, y = var_369)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> input_43 = add(x = var_370, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 3, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 3, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_354 = mul(x = input_51, y = var_353)[name = tensor<string, []>("op_354")];
-            tensor<fp32, [1, 3, 256]> input_53 = add(x = var_354, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 3, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 3, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_399 = const()[name = tensor<string, []>("op_399"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_400 = mul(x = input_53, y = var_399)[name = tensor<string, []>("op_400")];
+            tensor<fp32, [1, 3, 256]> input_55 = add(x = var_400, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -406,163 +424,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 3, 256]> var_368 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> var_414 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_415 = const()[name = tensor<string, []>("op_415"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_416 = reshape(shape = var_415, x = var_414)[name = tensor<string, []>("op_416")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_374 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_376 = mul(x = var_374, y = var_375)[name = tensor<string, []>("op_376")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 3, 256]> var_420 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_421 = const()[name = tensor<string, []>("op_421"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_422 = mul(x = var_420, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423 = const()[name = tensor<string, []>("op_423"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_424 = reshape(shape = var_423, x = var_422)[name = tensor<string, []>("op_424")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_382 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_384 = reshape(shape = var_383, x = var_382)[name = tensor<string, []>("op_384")];
+            tensor<fp32, [1, 3, 256]> var_428 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 3, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [3]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_424)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_416)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 3, 3]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_395 = reshape(shape = var_394, x = sqrt_s_t_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [3, 3]> M_3 = real_div(x = encoder__causal_mask, y = var_395)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 3, 3]> var_397 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_397")];
+            tensor<int32, [2]> var_440 = const()[name = tensor<string, []>("op_440"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_441 = reshape(shape = var_440, x = sqrt_s_t_3)[name = tensor<string, []>("op_441")];
+            tensor<fp32, [3, 3]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_441)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 3, 3]> var_443 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_443")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_384)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_397, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_399_transpose_x_0 = const()[name = tensor<string, []>("op_399_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_399_transpose_y_0 = const()[name = tensor<string, []>("op_399_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_0, transpose_y = var_399_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [3]> var_400 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_400")];
-            tensor<int32, [4]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_402 = reshape(shape = var_401, x = var_400)[name = tensor<string, []>("op_402")];
-            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_399, y = var_402)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_430)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_443, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_445_transpose_x_0 = const()[name = tensor<string, []>("op_445_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_445_transpose_y_0 = const()[name = tensor<string, []>("op_445_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_445 = matmul(transpose_x = var_445_transpose_x_0, transpose_y = var_445_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_445")];
+            tensor<fp32, [3]> var_446 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_446")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
+            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_445, y = var_448)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 3, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_405 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_405")];
-            tensor<bool, []> var_407_transpose_x_1 = const()[name = tensor<string, []>("op_407_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_407_transpose_y_1 = const()[name = tensor<string, []>("op_407_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_1, transpose_y = var_407_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_405, y = var_407)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_409)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_411 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 4, 64, 64]> var_412 = real_div(x = new_kv_unnorm_3, y = var_411)[name = tensor<string, []>("op_412")];
-            tensor<int32, [4]> var_413_perm_0 = const()[name = tensor<string, []>("op_413_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_451 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_451")];
+            tensor<bool, []> var_453_transpose_x_1 = const()[name = tensor<string, []>("op_453_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_453_transpose_y_1 = const()[name = tensor<string, []>("op_453_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_453 = matmul(transpose_x = var_453_transpose_x_1, transpose_y = var_453_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_453")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_451, y = var_453)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_455 = const()[name = tensor<string, []>("op_455"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_455)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_457 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_457")];
+            tensor<fp32, [1, 4, 64, 64]> var_458 = real_div(x = new_kv_unnorm_3, y = var_457)[name = tensor<string, []>("op_458")];
+            tensor<int32, [4]> var_459_perm_0 = const()[name = tensor<string, []>("op_459_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_413 = transpose(perm = var_413_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_413)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_417, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 3, 256]> var_419 = silu(x = input_57)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 3, 256]> input_59 = mul(x = var_419, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 3, 4, 64]> var_459 = transpose(perm = var_459_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_64, x = var_459)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_463 = const()[name = tensor<string, []>("op_463"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_463, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 3, 256]> var_465 = silu(x = input_59)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 3, 256]> input_61 = mul(x = var_465, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<int32, [3]> var_473_begin_0 = const()[name = tensor<string, []>("op_473_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_473_end_0 = const()[name = tensor<string, []>("op_473_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_473_end_mask_0 = const()[name = tensor<string, []>("op_473_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_473 = slice_by_index(begin = var_473_begin_0, end = var_473_end_0, end_mask = var_473_end_mask_0, x = x_9)[name = tensor<string, []>("op_473")];
+            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = window_9)[name = tensor<string, []>("op_476")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_72, interleave = window_11_interleave_0, values = (var_476, var_473))[name = tensor<string, []>("window_11")];
+            tensor<int32, [3]> var_481_begin_0 = const()[name = tensor<string, []>("op_481_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_481_end_0 = const()[name = tensor<string, []>("op_481_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_481_end_mask_0 = const()[name = tensor<string, []>("op_481_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_481 = slice_by_index(begin = var_481_begin_0, end = var_481_end_0, end_mask = var_481_end_mask_0, x = x_9)[name = tensor<string, []>("op_481")];
+            tensor<int32, [3]> var_484_begin_0 = const()[name = tensor<string, []>("op_484_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_484_end_0 = const()[name = tensor<string, []>("op_484_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_484_end_mask_0 = const()[name = tensor<string, []>("op_484_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_484 = slice_by_index(begin = var_484_begin_0, end = var_484_end_0, end_mask = var_484_end_mask_0, x = window_11)[name = tensor<string, []>("op_484")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_72, interleave = window_13_interleave_0, values = (var_484, var_481))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_489_begin_0 = const()[name = tensor<string, []>("op_489_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_489_end_0 = const()[name = tensor<string, []>("op_489_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_489_end_mask_0 = const()[name = tensor<string, []>("op_489_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_489 = slice_by_index(begin = var_489_begin_0, end = var_489_end_0, end_mask = var_489_end_mask_0, x = x_9)[name = tensor<string, []>("op_489")];
+            tensor<int32, [3]> var_492_begin_0 = const()[name = tensor<string, []>("op_492_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_492_end_0 = const()[name = tensor<string, []>("op_492_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_492_end_mask_0 = const()[name = tensor<string, []>("op_492_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_492 = slice_by_index(begin = var_492_begin_0, end = var_492_end_0, end_mask = var_492_end_mask_0, x = window_13)[name = tensor<string, []>("op_492")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_72, interleave = window_15_interleave_0, values = (var_492, var_489))[name = tensor<string, []>("window_15")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_63 = concat(axis = var_59, interleave = input_63_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_471_split_sizes_0 = const()[name = tensor<string, []>("op_471_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_471_axis_0 = const()[name = tensor<string, []>("op_471_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_471_0, tensor<fp32, [3, 256, 16]> var_471_1 = split(axis = var_471_axis_0, split_sizes = var_471_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_471")];
-            tensor<fp32, [3, 256, 16]> var_473 = sigmoid(x = var_471_1)[name = tensor<string, []>("op_473")];
-            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_471_0, y = var_473)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [3, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_517_split_sizes_0 = const()[name = tensor<string, []>("op_517_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_517_axis_0 = const()[name = tensor<string, []>("op_517_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_517_0, tensor<fp32, [3, 256, 16]> var_517_1 = split(axis = var_517_axis_0, split_sizes = var_517_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_517")];
+            tensor<fp32, [3, 256, 16]> var_519 = sigmoid(x = var_517_1)[name = tensor<string, []>("op_519")];
+            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_517_0, y = var_519)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [3, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [3, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_504_begin_0 = const()[name = tensor<string, []>("op_504_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_504_end_0 = const()[name = tensor<string, []>("op_504_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_504_end_mask_0 = const()[name = tensor<string, []>("op_504_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [3, 1, 256]> var_504 = slice_by_index(begin = var_504_begin_0, end = var_504_end_0, end_mask = var_504_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_504")];
-            tensor<int32, [3]> var_506_perm_0 = const()[name = tensor<string, []>("op_506_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_506 = transpose(perm = var_506_perm_0, x = var_504)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 3, 256]> input_71 = add(x = x_9, y = var_506)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 3, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 3, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_529 = const()[name = tensor<string, []>("op_529"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_530 = mul(x = input_79, y = var_529)[name = tensor<string, []>("op_530")];
-            tensor<fp32, [1, 3, 256]> input_81 = add(x = var_530, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_550_begin_0 = const()[name = tensor<string, []>("op_550_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_550_end_0 = const()[name = tensor<string, []>("op_550_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_550_end_mask_0 = const()[name = tensor<string, []>("op_550_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [3, 1, 256]> var_550 = slice_by_index(begin = var_550_begin_0, end = var_550_end_0, end_mask = var_550_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_550")];
+            tensor<int32, [3]> var_552_perm_0 = const()[name = tensor<string, []>("op_552_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_552 = transpose(perm = var_552_perm_0, x = var_550)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 3, 256]> input_73 = add(x = x_9, y = var_552)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 3, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 3, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_576 = mul(x = input_81, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> input_83 = add(x = var_576, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 3, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 3, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_560 = mul(x = input_91, y = var_559)[name = tensor<string, []>("op_560")];
-            tensor<fp32, [1, 3, 256]> input_93 = add(x = var_560, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 3, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 3, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_606 = mul(x = input_93, y = var_605)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 3, 256]> input_95 = add(x = var_606, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -573,163 +591,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 3, 256]> var_574 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> var_620 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_621 = const()[name = tensor<string, []>("op_621"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_622 = reshape(shape = var_621, x = var_620)[name = tensor<string, []>("op_622")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_580 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_581 = const()[name = tensor<string, []>("op_581"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_582 = mul(x = var_580, y = var_581)[name = tensor<string, []>("op_582")];
-            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
+            tensor<fp32, [1, 3, 256]> var_626 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_628 = mul(x = var_626, y = var_627)[name = tensor<string, []>("op_628")];
+            tensor<int32, [4]> var_629 = const()[name = tensor<string, []>("op_629"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_630 = reshape(shape = var_629, x = var_628)[name = tensor<string, []>("op_630")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_588 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_589 = const()[name = tensor<string, []>("op_589"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_590 = reshape(shape = var_589, x = var_588)[name = tensor<string, []>("op_590")];
+            tensor<fp32, [1, 3, 256]> var_634 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_636 = reshape(shape = var_635, x = var_634)[name = tensor<string, []>("op_636")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 3, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [3]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_576)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_630)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_622)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 3, 3]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_600 = const()[name = tensor<string, []>("op_600"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_601 = reshape(shape = var_600, x = sqrt_s_t_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [3, 3]> M_5 = real_div(x = encoder__causal_mask, y = var_601)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 3, 3]> var_603 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_603")];
+            tensor<int32, [2]> var_646 = const()[name = tensor<string, []>("op_646"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_647 = reshape(shape = var_646, x = sqrt_s_t_5)[name = tensor<string, []>("op_647")];
+            tensor<fp32, [3, 3]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_647)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 3, 3]> var_649 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_649")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_590)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_603, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_605_transpose_x_0 = const()[name = tensor<string, []>("op_605_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_605_transpose_y_0 = const()[name = tensor<string, []>("op_605_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_605 = matmul(transpose_x = var_605_transpose_x_0, transpose_y = var_605_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_605")];
-            tensor<fp32, [3]> var_606 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_606")];
-            tensor<int32, [4]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_608 = reshape(shape = var_607, x = var_606)[name = tensor<string, []>("op_608")];
-            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_605, y = var_608)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_636)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_649, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_651_transpose_x_0 = const()[name = tensor<string, []>("op_651_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_651_transpose_y_0 = const()[name = tensor<string, []>("op_651_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_651 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_651")];
+            tensor<fp32, [3]> var_652 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_652")];
+            tensor<int32, [4]> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_654 = reshape(shape = var_653, x = var_652)[name = tensor<string, []>("op_654")];
+            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_651, y = var_654)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 3, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_611 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_611")];
-            tensor<bool, []> var_613_transpose_x_1 = const()[name = tensor<string, []>("op_613_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_613_transpose_y_1 = const()[name = tensor<string, []>("op_613_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_613 = matmul(transpose_x = var_613_transpose_x_1, transpose_y = var_613_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_613")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_611, y = var_613)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_615 = const()[name = tensor<string, []>("op_615"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_615)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_617 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [1, 4, 64, 64]> var_618 = real_div(x = new_kv_unnorm_5, y = var_617)[name = tensor<string, []>("op_618")];
-            tensor<int32, [4]> var_619_perm_0 = const()[name = tensor<string, []>("op_619_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_657 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_657")];
+            tensor<bool, []> var_659_transpose_x_1 = const()[name = tensor<string, []>("op_659_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_659_transpose_y_1 = const()[name = tensor<string, []>("op_659_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_659 = matmul(transpose_x = var_659_transpose_x_1, transpose_y = var_659_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_657, y = var_659)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_661)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_663 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_663")];
+            tensor<fp32, [1, 4, 64, 64]> var_664 = real_div(x = new_kv_unnorm_5, y = var_663)[name = tensor<string, []>("op_664")];
+            tensor<int32, [4]> var_665_perm_0 = const()[name = tensor<string, []>("op_665_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_619 = transpose(perm = var_619_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_619)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_623, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 3, 256]> var_625 = silu(x = input_97)[name = tensor<string, []>("op_625")];
-            tensor<fp32, [1, 3, 256]> input_99 = mul(x = var_625, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 3, 4, 64]> var_665 = transpose(perm = var_665_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_64, x = var_665)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_669 = const()[name = tensor<string, []>("op_669"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_669, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 3, 256]> var_671 = silu(x = input_99)[name = tensor<string, []>("op_671")];
+            tensor<fp32, [1, 3, 256]> input_101 = mul(x = var_671, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_17_begin_0 = const()[name = tensor<string, []>("window_17_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_17_end_0 = const()[name = tensor<string, []>("window_17_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_17_end_mask_0 = const()[name = tensor<string, []>("window_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_17_squeeze_mask_0 = const()[name = tensor<string, []>("window_17_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_17 = slice_by_index(begin = window_17_begin_0, end = window_17_end_0, end_mask = window_17_end_mask_0, squeeze_mask = window_17_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_633_begin_0 = const()[name = tensor<string, []>("op_633_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_633_end_0 = const()[name = tensor<string, []>("op_633_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_633_end_mask_0 = const()[name = tensor<string, []>("op_633_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_633 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = x_15)[name = tensor<string, []>("op_633")];
-            tensor<int32, [3]> var_636_begin_0 = const()[name = tensor<string, []>("op_636_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_636_end_0 = const()[name = tensor<string, []>("op_636_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_636_end_mask_0 = const()[name = tensor<string, []>("op_636_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_636 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = window_17)[name = tensor<string, []>("op_636")];
+            tensor<int32, [3]> var_679_begin_0 = const()[name = tensor<string, []>("op_679_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_679_end_0 = const()[name = tensor<string, []>("op_679_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_679_end_mask_0 = const()[name = tensor<string, []>("op_679_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_679 = slice_by_index(begin = var_679_begin_0, end = var_679_end_0, end_mask = var_679_end_mask_0, x = x_15)[name = tensor<string, []>("op_679")];
+            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = window_17)[name = tensor<string, []>("op_682")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_636, var_633))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_641_begin_0 = const()[name = tensor<string, []>("op_641_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_641_end_0 = const()[name = tensor<string, []>("op_641_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_641_end_mask_0 = const()[name = tensor<string, []>("op_641_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_641 = slice_by_index(begin = var_641_begin_0, end = var_641_end_0, end_mask = var_641_end_mask_0, x = x_15)[name = tensor<string, []>("op_641")];
-            tensor<int32, [3]> var_644_begin_0 = const()[name = tensor<string, []>("op_644_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_644_end_0 = const()[name = tensor<string, []>("op_644_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_644_end_mask_0 = const()[name = tensor<string, []>("op_644_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_644 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = window_19)[name = tensor<string, []>("op_644")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_72, interleave = window_19_interleave_0, values = (var_682, var_679))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_687_begin_0 = const()[name = tensor<string, []>("op_687_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_687_end_0 = const()[name = tensor<string, []>("op_687_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_687_end_mask_0 = const()[name = tensor<string, []>("op_687_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_687 = slice_by_index(begin = var_687_begin_0, end = var_687_end_0, end_mask = var_687_end_mask_0, x = x_15)[name = tensor<string, []>("op_687")];
+            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = window_19)[name = tensor<string, []>("op_690")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_644, var_641))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_72, interleave = window_21_interleave_0, values = (var_690, var_687))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_695_begin_0 = const()[name = tensor<string, []>("op_695_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_695_end_0 = const()[name = tensor<string, []>("op_695_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_695_end_mask_0 = const()[name = tensor<string, []>("op_695_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_695 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = x_15)[name = tensor<string, []>("op_695")];
+            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = window_21)[name = tensor<string, []>("op_698")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_72, interleave = window_23_interleave_0, values = (var_698, var_695))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_103 = concat(axis = var_59, interleave = input_103_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_677_split_sizes_0 = const()[name = tensor<string, []>("op_677_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_677_axis_0 = const()[name = tensor<string, []>("op_677_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_677_0, tensor<fp32, [3, 256, 16]> var_677_1 = split(axis = var_677_axis_0, split_sizes = var_677_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_677")];
-            tensor<fp32, [3, 256, 16]> var_679 = sigmoid(x = var_677_1)[name = tensor<string, []>("op_679")];
-            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_677_0, y = var_679)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [3, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_723_split_sizes_0 = const()[name = tensor<string, []>("op_723_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_723_axis_0 = const()[name = tensor<string, []>("op_723_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_723_0, tensor<fp32, [3, 256, 16]> var_723_1 = split(axis = var_723_axis_0, split_sizes = var_723_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_723")];
+            tensor<fp32, [3, 256, 16]> var_725 = sigmoid(x = var_723_1)[name = tensor<string, []>("op_725")];
+            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_723_0, y = var_725)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [3, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [3, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_710_begin_0 = const()[name = tensor<string, []>("op_710_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_710_end_0 = const()[name = tensor<string, []>("op_710_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_710_end_mask_0 = const()[name = tensor<string, []>("op_710_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [3, 1, 256]> var_710 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_710")];
-            tensor<int32, [3]> var_712_perm_0 = const()[name = tensor<string, []>("op_712_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_712 = transpose(perm = var_712_perm_0, x = var_710)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 3, 256]> input_111 = add(x = x_15, y = var_712)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 3, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 3, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_735 = const()[name = tensor<string, []>("op_735"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_736 = mul(x = input_119, y = var_735)[name = tensor<string, []>("op_736")];
-            tensor<fp32, [1, 3, 256]> input_121 = add(x = var_736, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [3, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_758_perm_0 = const()[name = tensor<string, []>("op_758_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_758 = transpose(perm = var_758_perm_0, x = var_756)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 3, 256]> input_113 = add(x = x_15, y = var_758)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 3, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 3, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_782 = mul(x = input_121, y = var_781)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> input_123 = add(x = var_782, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 3, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 3, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_766 = mul(x = input_131, y = var_765)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 3, 256]> input_133 = add(x = var_766, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 3, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 3, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_812 = mul(x = input_133, y = var_811)[name = tensor<string, []>("op_812")];
+            tensor<fp32, [1, 3, 256]> input_135 = add(x = var_812, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -740,199 +758,192 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 3, 256]> var_780 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_782 = reshape(shape = var_781, x = var_780)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> var_826 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_827 = const()[name = tensor<string, []>("op_827"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_828 = reshape(shape = var_827, x = var_826)[name = tensor<string, []>("op_828")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_786 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_787 = const()[name = tensor<string, []>("op_787"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_788 = mul(x = var_786, y = var_787)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
+            tensor<fp32, [1, 3, 256]> var_832 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_834 = mul(x = var_832, y = var_833)[name = tensor<string, []>("op_834")];
+            tensor<int32, [4]> var_835 = const()[name = tensor<string, []>("op_835"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_836 = reshape(shape = var_835, x = var_834)[name = tensor<string, []>("op_836")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_794 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_795 = const()[name = tensor<string, []>("op_795"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_796 = reshape(shape = var_795, x = var_794)[name = tensor<string, []>("op_796")];
+            tensor<fp32, [1, 3, 256]> var_840 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_842 = reshape(shape = var_841, x = var_840)[name = tensor<string, []>("op_842")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 3, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [3]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_790)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_782)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_836)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_828)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 3, 3]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_806 = const()[name = tensor<string, []>("op_806"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_807 = reshape(shape = var_806, x = sqrt_s_t_7)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [3, 3]> M_7 = real_div(x = encoder__causal_mask, y = var_807)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 3, 3]> var_809 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_809")];
+            tensor<int32, [2]> var_852 = const()[name = tensor<string, []>("op_852"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_853 = reshape(shape = var_852, x = sqrt_s_t_7)[name = tensor<string, []>("op_853")];
+            tensor<fp32, [3, 3]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_853)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 3, 3]> var_855 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_855")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_796)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_809, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_811_transpose_x_0 = const()[name = tensor<string, []>("op_811_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_811_transpose_y_0 = const()[name = tensor<string, []>("op_811_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_811 = matmul(transpose_x = var_811_transpose_x_0, transpose_y = var_811_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_811")];
-            tensor<fp32, [3]> var_812 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
-            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_811, y = var_814)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_842)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_855, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_857_transpose_x_0 = const()[name = tensor<string, []>("op_857_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_857_transpose_y_0 = const()[name = tensor<string, []>("op_857_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_857 = matmul(transpose_x = var_857_transpose_x_0, transpose_y = var_857_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_857")];
+            tensor<fp32, [3]> var_858 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [4]> var_859 = const()[name = tensor<string, []>("op_859"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_860 = reshape(shape = var_859, x = var_858)[name = tensor<string, []>("op_860")];
+            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_857, y = var_860)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 3, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_817 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_817")];
-            tensor<bool, []> var_819_transpose_x_1 = const()[name = tensor<string, []>("op_819_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_819_transpose_y_1 = const()[name = tensor<string, []>("op_819_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_819 = matmul(transpose_x = var_819_transpose_x_1, transpose_y = var_819_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_817, y = var_819)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_821 = const()[name = tensor<string, []>("op_821"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_821)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_823 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_823")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_823)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_825_perm_0 = const()[name = tensor<string, []>("op_825_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_863 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_863")];
+            tensor<bool, []> var_865_transpose_x_1 = const()[name = tensor<string, []>("op_865_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_865_transpose_y_1 = const()[name = tensor<string, []>("op_865_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_865 = matmul(transpose_x = var_865_transpose_x_1, transpose_y = var_865_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_865")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_863, y = var_865)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_867)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_869 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_869")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_869)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_871_perm_0 = const()[name = tensor<string, []>("op_871_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_825 = transpose(perm = var_825_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_825)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_829, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 3, 256]> var_831 = silu(x = input_137)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [1, 3, 256]> input_139 = mul(x = var_831, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 3, 4, 64]> var_871 = transpose(perm = var_871_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_64, x = var_871)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_875, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 3, 256]> var_877 = silu(x = input_139)[name = tensor<string, []>("op_877")];
+            tensor<fp32, [1, 3, 256]> input_141 = mul(x = var_877, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_839_begin_0 = const()[name = tensor<string, []>("op_839_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_839_end_0 = const()[name = tensor<string, []>("op_839_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_839_end_mask_0 = const()[name = tensor<string, []>("op_839_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_839 = slice_by_index(begin = var_839_begin_0, end = var_839_end_0, end_mask = var_839_end_mask_0, x = x_21)[name = tensor<string, []>("op_839")];
-            tensor<int32, [3]> var_842_begin_0 = const()[name = tensor<string, []>("op_842_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_842_end_0 = const()[name = tensor<string, []>("op_842_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_842_end_mask_0 = const()[name = tensor<string, []>("op_842_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_842 = slice_by_index(begin = var_842_begin_0, end = var_842_end_0, end_mask = var_842_end_mask_0, x = window_25)[name = tensor<string, []>("op_842")];
+            tensor<int32, [3]> var_885_begin_0 = const()[name = tensor<string, []>("op_885_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_885_end_0 = const()[name = tensor<string, []>("op_885_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_885_end_mask_0 = const()[name = tensor<string, []>("op_885_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_885 = slice_by_index(begin = var_885_begin_0, end = var_885_end_0, end_mask = var_885_end_mask_0, x = x_21)[name = tensor<string, []>("op_885")];
+            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = window_25)[name = tensor<string, []>("op_888")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_842, var_839))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_847_begin_0 = const()[name = tensor<string, []>("op_847_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_847_end_0 = const()[name = tensor<string, []>("op_847_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_847_end_mask_0 = const()[name = tensor<string, []>("op_847_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_847 = slice_by_index(begin = var_847_begin_0, end = var_847_end_0, end_mask = var_847_end_mask_0, x = x_21)[name = tensor<string, []>("op_847")];
-            tensor<int32, [3]> var_850_begin_0 = const()[name = tensor<string, []>("op_850_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_850_end_0 = const()[name = tensor<string, []>("op_850_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_850_end_mask_0 = const()[name = tensor<string, []>("op_850_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_850 = slice_by_index(begin = var_850_begin_0, end = var_850_end_0, end_mask = var_850_end_mask_0, x = window_27)[name = tensor<string, []>("op_850")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_72, interleave = window_27_interleave_0, values = (var_888, var_885))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_893_begin_0 = const()[name = tensor<string, []>("op_893_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_893_end_0 = const()[name = tensor<string, []>("op_893_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_893_end_mask_0 = const()[name = tensor<string, []>("op_893_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_893 = slice_by_index(begin = var_893_begin_0, end = var_893_end_0, end_mask = var_893_end_mask_0, x = x_21)[name = tensor<string, []>("op_893")];
+            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = window_27)[name = tensor<string, []>("op_896")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_850, var_847))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_855_begin_0 = const()[name = tensor<string, []>("op_855_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_855_end_0 = const()[name = tensor<string, []>("op_855_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_855_end_mask_0 = const()[name = tensor<string, []>("op_855_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_855 = slice_by_index(begin = var_855_begin_0, end = var_855_end_0, end_mask = var_855_end_mask_0, x = x_21)[name = tensor<string, []>("op_855")];
-            tensor<int32, [3]> var_858_begin_0 = const()[name = tensor<string, []>("op_858_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_858_end_0 = const()[name = tensor<string, []>("op_858_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_858_end_mask_0 = const()[name = tensor<string, []>("op_858_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_858 = slice_by_index(begin = var_858_begin_0, end = var_858_end_0, end_mask = var_858_end_mask_0, x = window_29)[name = tensor<string, []>("op_858")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_72, interleave = window_29_interleave_0, values = (var_896, var_893))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_901_begin_0 = const()[name = tensor<string, []>("op_901_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_901_end_0 = const()[name = tensor<string, []>("op_901_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_901_end_mask_0 = const()[name = tensor<string, []>("op_901_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_901 = slice_by_index(begin = var_901_begin_0, end = var_901_end_0, end_mask = var_901_end_mask_0, x = x_21)[name = tensor<string, []>("op_901")];
+            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = window_29)[name = tensor<string, []>("op_904")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_858, var_855))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_72, interleave = window_interleave_0, values = (var_904, var_901))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_143 = concat(axis = var_59, interleave = input_143_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_883_split_sizes_0 = const()[name = tensor<string, []>("op_883_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_883_axis_0 = const()[name = tensor<string, []>("op_883_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_883_0, tensor<fp32, [3, 256, 16]> var_883_1 = split(axis = var_883_axis_0, split_sizes = var_883_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_883")];
-            tensor<fp32, [3, 256, 16]> var_885 = sigmoid(x = var_883_1)[name = tensor<string, []>("op_885")];
-            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_883_0, y = var_885)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [3, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_929_split_sizes_0 = const()[name = tensor<string, []>("op_929_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_929_0, tensor<fp32, [3, 256, 16]> var_929_1 = split(axis = var_929_axis_0, split_sizes = var_929_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [3, 256, 16]> var_931 = sigmoid(x = var_929_1)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_929_0, y = var_931)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [3, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [3, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_916_begin_0 = const()[name = tensor<string, []>("op_916_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_916_end_0 = const()[name = tensor<string, []>("op_916_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_916_end_mask_0 = const()[name = tensor<string, []>("op_916_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [3, 1, 256]> var_916 = slice_by_index(begin = var_916_begin_0, end = var_916_end_0, end_mask = var_916_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_916")];
-            tensor<int32, [3]> var_918_perm_0 = const()[name = tensor<string, []>("op_918_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_918 = transpose(perm = var_918_perm_0, x = var_916)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 3, 256]> input_151 = add(x = x_21, y = var_918)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 3, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 3, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_941 = const()[name = tensor<string, []>("op_941"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_942 = mul(x = input_159, y = var_941)[name = tensor<string, []>("op_942")];
-            tensor<fp32, [1, 3, 256]> input_161 = add(x = var_942, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [3, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_964_perm_0 = const()[name = tensor<string, []>("op_964_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_964 = transpose(perm = var_964_perm_0, x = var_962)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 3, 256]> input_153 = add(x = x_21, y = var_964)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 3, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 3, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_988 = mul(x = input_161, y = var_987)[name = tensor<string, []>("op_988")];
+            tensor<fp32, [1, 3, 256]> input_163 = add(x = var_988, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 3]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_61, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
-            tensor<int32, [3]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
-            tensor<bool, [3]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = cat)[name = tensor<string, []>("op_960")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 3, 1]> var_963 = reduce_l2_norm(axes = var_962, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_963")];
+            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1006_begin_0 = const()[name = tensor<string, []>("op_1006_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
+            tensor<int32, [3]> var_1006_end_0 = const()[name = tensor<string, []>("op_1006_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
+            tensor<bool, [3]> var_1006_end_mask_0 = const()[name = tensor<string, []>("op_1006_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1006_begin_0, end = var_1006_end_0, end_mask = var_1006_end_mask_0, x = cat)[name = tensor<string, []>("op_1006")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1008 = const()[name = tensor<string, []>("op_1008"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 3, 1]> var_1009 = reduce_l2_norm(axes = var_1008, keep_dims = var_55, x = input_165)[name = tensor<string, []>("op_1009")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_963)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_967_axis_0 = const()[name = tensor<string, []>("op_967_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_967_axis_0, values = (var_206, var_412, var_618, nkv_1))[name = tensor<string, []>("op_967")];
-            tensor<int32, []> var_969_axis_0 = const()[name = tensor<string, []>("op_969_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_969_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_969")];
-            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_971_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_971")];
-            tensor<fp32, []> var_980 = const()[name = tensor<string, []>("op_980"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_985 = const()[name = tensor<string, []>("op_985"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_988 = const()[name = tensor<string, []>("op_988"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_990 = const()[name = tensor<string, []>("op_990"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1000 = const()[name = tensor<string, []>("op_1000"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_69, beta = const_12, x = var_1009)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1013_axis_0 = const()[name = tensor<string, []>("op_1013_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1013_axis_0, values = (var_252, var_458, var_664, nkv_1))[name = tensor<string, []>("op_1013")];
+            tensor<int32, []> var_1015_axis_0 = const()[name = tensor<string, []>("op_1015_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1015_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1015")];
+            tensor<int32, []> var_1017_axis_0 = const()[name = tensor<string, []>("op_1017_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1017_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_1017")];
             tensor<fp32, [1, 3, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 3, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1062_axes_0 = const()[name = tensor<string, []>("op_1062_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 3, 1, 256]> var_1062 = expand_dims(axes = var_1062_axes_0, x = emb)[name = tensor<string, []>("op_1062")];
+            tensor<int32, [1]> var_1085_axes_0 = const()[name = tensor<string, []>("op_1085_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 3, 1, 256]> var_1085 = expand_dims(axes = var_1085_axes_0, x = emb)[name = tensor<string, []>("op_1085")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 3, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1062)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 3, 12, 512]> input_165 = concat(axis = var_994, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 3, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1070_perm_0 = const()[name = tensor<string, []>("op_1070_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1074 = const()[name = tensor<string, []>("op_1074"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1070 = transpose(perm = var_1070_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 3, 256]> x_29 = reshape(shape = var_1074, x = var_1070)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 3, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1085)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 3, 12, 512]> input_167 = concat(axis = var_62, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 3, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1093_perm_0 = const()[name = tensor<string, []>("op_1093_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1093 = transpose(perm = var_1093_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 3, 256]> x_29 = reshape(shape = var_1097, x = var_1093)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -943,132 +954,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 3, 256]> var_1082 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1083 = const()[name = tensor<string, []>("op_1083"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1084 = reshape(shape = var_1083, x = var_1082)[name = tensor<string, []>("op_1084")];
+            tensor<fp32, [12, 3, 256]> var_1105 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1107 = reshape(shape = var_1106, x = var_1105)[name = tensor<string, []>("op_1107")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1088 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1089 = const()[name = tensor<string, []>("op_1089"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 3, 256]> var_1090 = mul(x = var_1088, y = var_1089)[name = tensor<string, []>("op_1090")];
-            tensor<int32, [4]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1092 = reshape(shape = var_1091, x = var_1090)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [12, 3, 256]> var_1111 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1112 = const()[name = tensor<string, []>("op_1112"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 3, 256]> var_1113 = mul(x = var_1111, y = var_1112)[name = tensor<string, []>("op_1113")];
+            tensor<int32, [4]> var_1114 = const()[name = tensor<string, []>("op_1114"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1115 = reshape(shape = var_1114, x = var_1113)[name = tensor<string, []>("op_1115")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1096 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1098 = reshape(shape = var_1097, x = var_1096)[name = tensor<string, []>("op_1098")];
+            tensor<fp32, [12, 3, 256]> var_1119 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 3, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_1000, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_59, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [3]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_1 = clip(alpha = var_990, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [3]> clip_1 = clip(alpha = var_49, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [3]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1092)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1084)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1115)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1107)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 3, 3]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [2]>([1, 3])];
-            tensor<fp32, [1, 3]> var_1111 = reshape(shape = var_1110, x = valid_mask)[name = tensor<string, []>("op_1111")];
-            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1111)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1114 = reshape(shape = var_1113, x = sqrt_s_t_9)[name = tensor<string, []>("op_1114")];
-            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1114)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 3, 3]> var_1116 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1116")];
+            tensor<int32, [2]> var_1133 = const()[name = tensor<string, []>("op_1133"), val = tensor<int32, [2]>([1, 3])];
+            tensor<fp32, [1, 3]> var_1134 = reshape(shape = var_1133, x = valid_mask)[name = tensor<string, []>("op_1134")];
+            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1134)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1136 = const()[name = tensor<string, []>("op_1136"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1137 = reshape(shape = var_1136, x = sqrt_s_t_9)[name = tensor<string, []>("op_1137")];
+            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1137)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 3, 3]> var_1139 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1139")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1098)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1116, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1118_transpose_x_0 = const()[name = tensor<string, []>("op_1118_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1118_transpose_y_0 = const()[name = tensor<string, []>("op_1118_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> var_1118 = matmul(transpose_x = var_1118_transpose_x_0, transpose_y = var_1118_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1118")];
-            tensor<fp32, [3]> var_1119 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1119")];
-            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
-            tensor<fp32, [12, 4, 3, 64]> cross_9 = mul(x = var_1118, y = var_1121)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1121)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1139, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1141_transpose_x_0 = const()[name = tensor<string, []>("op_1141_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1141_transpose_y_0 = const()[name = tensor<string, []>("op_1141_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> var_1141 = matmul(transpose_x = var_1141_transpose_x_0, transpose_y = var_1141_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1141")];
+            tensor<fp32, [3]> var_1142 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1142")];
+            tensor<int32, [4]> var_1143 = const()[name = tensor<string, []>("op_1143"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1144 = reshape(shape = var_1143, x = var_1142)[name = tensor<string, []>("op_1144")];
+            tensor<fp32, [12, 4, 3, 64]> cross_9 = mul(x = var_1141, y = var_1144)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 3, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1124 = const()[name = tensor<string, []>("op_1124"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1125 = reshape(shape = var_1124, x = valid_mask)[name = tensor<string, []>("op_1125")];
-            tensor<fp32, [12, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1125)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1127 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1127")];
-            tensor<bool, []> var_1129_transpose_x_1 = const()[name = tensor<string, []>("op_1129_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1129_transpose_y_1 = const()[name = tensor<string, []>("op_1129_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1129 = matmul(transpose_x = var_1129_transpose_x_1, transpose_y = var_1129_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1129")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1127, y = var_1129)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1131_keep_dims_0 = const()[name = tensor<string, []>("op_1131_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1131 = reduce_sum(keep_dims = var_1131_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1131")];
-            tensor<int32, [1]> var_1132 = const()[name = tensor<string, []>("op_1132"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1133 = reshape(shape = var_1132, x = var_1131)[name = tensor<string, []>("op_1133")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1133)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1147 = const()[name = tensor<string, []>("op_1147"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1148 = reshape(shape = var_1147, x = valid_mask)[name = tensor<string, []>("op_1148")];
+            tensor<fp32, [12, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1148)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1150 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1150")];
+            tensor<bool, []> var_1152_transpose_x_1 = const()[name = tensor<string, []>("op_1152_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1152_transpose_y_1 = const()[name = tensor<string, []>("op_1152_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1152 = matmul(transpose_x = var_1152_transpose_x_1, transpose_y = var_1152_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1152")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1150, y = var_1152)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1154_keep_dims_0 = const()[name = tensor<string, []>("op_1154_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1154 = reduce_sum(keep_dims = var_1154_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1154")];
+            tensor<int32, [1]> var_1155 = const()[name = tensor<string, []>("op_1155"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1156 = reshape(shape = var_1155, x = var_1154)[name = tensor<string, []>("op_1156")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1156)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_990, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_49, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1137 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1137")];
-            tensor<int32, [4]> var_1138_perm_0 = const()[name = tensor<string, []>("op_1138_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1160 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1160")];
+            tensor<int32, [4]> var_1161_perm_0 = const()[name = tensor<string, []>("op_1161_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 4, 64]> var_1138 = transpose(perm = var_1138_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_987, x = var_1138)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> out_29 = reshape(shape = var_1142, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 3, 256]> var_1144 = silu(x = input_169)[name = tensor<string, []>("op_1144")];
-            tensor<fp32, [12, 3, 256]> input_171 = mul(x = var_1144, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 3, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 3, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 3, 4, 64]> var_1161 = transpose(perm = var_1161_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_64, x = var_1161)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> out_29 = reshape(shape = var_1165, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 3, 256]> var_1167 = silu(x = input_171)[name = tensor<string, []>("op_1167")];
+            tensor<fp32, [12, 3, 256]> input_173 = mul(x = var_1167, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 3, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 3, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_985, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1155 = reshape(shape = var_1154, x = xt_1)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156_perm_0 = const()[name = tensor<string, []>("op_1156_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [3]>([3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> var_1156 = transpose(perm = var_1156_perm_0, x = var_1155)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [3, 12, 256]> query_1 = reshape(shape = var_1159, x = var_1156)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_56, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [4]>([1, 12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1178 = reshape(shape = var_1177, x = xt_1)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [4]> var_1179_perm_0 = const()[name = tensor<string, []>("op_1179_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> var_1179 = transpose(perm = var_1179_perm_0, x = var_1178)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [3, 12, 256]> query_1 = reshape(shape = var_1182, x = var_1179)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 3, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 3, 768]> var_1182 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 3, 768]> var_1205 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 3, 3, 256])];
-            tensor<fp32, [12, 3, 3, 256]> var_1184 = reshape(shape = concat_1, x = var_1182)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [1]> var_1185_axes_0 = const()[name = tensor<string, []>("op_1185_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 3, 3, 256]> var_1185 = expand_dims(axes = var_1185_axes_0, x = var_1184)[name = tensor<string, []>("op_1185")];
-            tensor<int32, [5]> var_1186_perm_0 = const()[name = tensor<string, []>("op_1186_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1187_axes_0 = const()[name = tensor<string, []>("op_1187_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 3, 1, 256]> var_1186 = transpose(perm = var_1186_perm_0, x = var_1185)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 3, 256]> var_1187 = squeeze(axes = var_1187_axes_0, x = var_1186)[name = tensor<string, []>("op_1187")];
+            tensor<fp32, [12, 3, 3, 256]> var_1207 = reshape(shape = concat_1, x = var_1205)[name = tensor<string, []>("op_1207")];
+            tensor<int32, [1]> var_1208_axes_0 = const()[name = tensor<string, []>("op_1208_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 3, 3, 256]> var_1208 = expand_dims(axes = var_1208_axes_0, x = var_1207)[name = tensor<string, []>("op_1208")];
+            tensor<int32, [5]> var_1209_perm_0 = const()[name = tensor<string, []>("op_1209_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1210_axes_0 = const()[name = tensor<string, []>("op_1210_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 3, 1, 256]> var_1209 = transpose(perm = var_1209_perm_0, x = var_1208)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 3, 256]> var_1210 = squeeze(axes = var_1210_axes_0, x = var_1209)[name = tensor<string, []>("op_1210")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 3, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 3, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 3, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1195 = const()[name = tensor<string, []>("op_1195"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1196 = reshape(shape = var_1195, x = q_11)[name = tensor<string, []>("op_1196")];
+            tensor<fp32, [12, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1219 = reshape(shape = var_1218, x = q_11)[name = tensor<string, []>("op_1219")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1202 = const()[name = tensor<string, []>("op_1202"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1203 = reshape(shape = var_1202, x = k_11)[name = tensor<string, []>("op_1203")];
+            tensor<int32, [3]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1226 = reshape(shape = var_1225, x = k_11)[name = tensor<string, []>("op_1226")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1210 = reshape(shape = var_1209, x = v_11)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [3]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1233 = reshape(shape = var_1232, x = v_11)[name = tensor<string, []>("op_1233")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1213 = const()[name = tensor<string, []>("op_1213"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1196)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [3, 4, 12, 64]> q_15 = reshape(shape = var_1213, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1215 = const()[name = tensor<string, []>("op_1215"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1203)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [3, 4, 12, 64]> k_15 = reshape(shape = var_1215, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1210)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [3, 4, 12, 64]> v_15 = reshape(shape = var_1217, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1219)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [3, 4, 12, 64]> q_15 = reshape(shape = var_1236, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1226)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [3, 4, 12, 64]> k_15 = reshape(shape = var_1238, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1233)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [3, 4, 12, 64]> v_15 = reshape(shape = var_1240, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1079,30 +1090,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1220 = const()[name = tensor<string, []>("op_1220"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [12, 3, 4, 64]> var_1221 = transpose(perm = var_1220, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1225, x = var_1221)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1229 = const()[name = tensor<string, []>("op_1229"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> attn_output_7 = reshape(shape = var_1229, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1243 = const()[name = tensor<string, []>("op_1243"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1248 = const()[name = tensor<string, []>("op_1248"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [12, 3, 4, 64]> var_1244 = transpose(perm = var_1243, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1248, x = var_1244)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> attn_output_7 = reshape(shape = var_1252, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [3, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_985, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [3, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [3, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [3, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [3, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_56, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [3, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [3, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [3, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_985, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([1, 3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> x_31 = reshape(shape = var_1249, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1255 = const()[name = tensor<string, []>("op_1255"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 3, 256]> x = reshape(shape = var_1255, x = var_1251)[name = tensor<string, []>("x")];
+            tensor<fp32, [3, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_56, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([1, 3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> x_31 = reshape(shape = var_1272, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1274_perm_0 = const()[name = tensor<string, []>("op_1274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1274 = transpose(perm = var_1274_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 3, 256]> x = reshape(shape = var_1278, x = var_1274)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1113,120 +1124,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 3, 256]> var_1263 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1264 = const()[name = tensor<string, []>("op_1264"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1265 = reshape(shape = var_1264, x = var_1263)[name = tensor<string, []>("op_1265")];
+            tensor<fp32, [12, 3, 256]> var_1286 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1288 = reshape(shape = var_1287, x = var_1286)[name = tensor<string, []>("op_1288")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1269 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1270 = const()[name = tensor<string, []>("op_1270"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 3, 256]> var_1271 = mul(x = var_1269, y = var_1270)[name = tensor<string, []>("op_1271")];
-            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1273 = reshape(shape = var_1272, x = var_1271)[name = tensor<string, []>("op_1273")];
+            tensor<fp32, [12, 3, 256]> var_1292 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1293 = const()[name = tensor<string, []>("op_1293"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 3, 256]> var_1294 = mul(x = var_1292, y = var_1293)[name = tensor<string, []>("op_1294")];
+            tensor<int32, [4]> var_1295 = const()[name = tensor<string, []>("op_1295"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1296 = reshape(shape = var_1295, x = var_1294)[name = tensor<string, []>("op_1296")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1277 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1279 = reshape(shape = var_1278, x = var_1277)[name = tensor<string, []>("op_1279")];
+            tensor<fp32, [12, 3, 256]> var_1300 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 3, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [3]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_3 = clip(alpha = var_990, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [3]> clip_3 = clip(alpha = var_49, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [3]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1273)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1265)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1296)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1288)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 3, 3]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1295 = reshape(shape = var_1294, x = sqrt_s_t)[name = tensor<string, []>("op_1295")];
-            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1295)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 3, 3]> var_1297 = mul(x = qk, y = M)[name = tensor<string, []>("op_1297")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1279)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 3, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1297, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1299_transpose_x_0 = const()[name = tensor<string, []>("op_1299_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1299_transpose_y_0 = const()[name = tensor<string, []>("op_1299_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> var_1299 = matmul(transpose_x = var_1299_transpose_x_0, transpose_y = var_1299_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1299")];
-            tensor<fp32, [3]> var_1300 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1300")];
-            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
-            tensor<fp32, [12, 4, 3, 64]> cross = mul(x = var_1299, y = var_1302)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 3, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1125)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1308 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1308")];
-            tensor<bool, []> var_1310_transpose_x_1 = const()[name = tensor<string, []>("op_1310_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1310_transpose_y_1 = const()[name = tensor<string, []>("op_1310_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1310 = matmul(transpose_x = var_1310_transpose_x_1, transpose_y = var_1310_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1310")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1308, y = var_1310)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1133)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1318 = reshape(shape = var_1317, x = sqrt_s_t)[name = tensor<string, []>("op_1318")];
+            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1318)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 3, 3]> var_1320 = mul(x = qk, y = M)[name = tensor<string, []>("op_1320")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1302)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 3, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1320, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1322_transpose_x_0 = const()[name = tensor<string, []>("op_1322_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1322_transpose_y_0 = const()[name = tensor<string, []>("op_1322_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> var_1322 = matmul(transpose_x = var_1322_transpose_x_0, transpose_y = var_1322_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1322")];
+            tensor<fp32, [3]> var_1323 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1323")];
+            tensor<int32, [4]> var_1324 = const()[name = tensor<string, []>("op_1324"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1325 = reshape(shape = var_1324, x = var_1323)[name = tensor<string, []>("op_1325")];
+            tensor<fp32, [12, 4, 3, 64]> cross = mul(x = var_1322, y = var_1325)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 3, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1148)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1331 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1331")];
+            tensor<bool, []> var_1333_transpose_x_1 = const()[name = tensor<string, []>("op_1333_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1333_transpose_y_1 = const()[name = tensor<string, []>("op_1333_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1333 = matmul(transpose_x = var_1333_transpose_x_1, transpose_y = var_1333_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1333")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1331, y = var_1333)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1156)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_990, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_49, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1319_perm_0 = const()[name = tensor<string, []>("op_1319_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1342_perm_0 = const()[name = tensor<string, []>("op_1342_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 4, 64]> var_1319 = transpose(perm = var_1319_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_987, x = var_1319)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> out = reshape(shape = var_1323, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 3, 256]> var_1325 = silu(x = input_187)[name = tensor<string, []>("op_1325")];
-            tensor<fp32, [12, 3, 256]> input_189 = mul(x = var_1325, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 3, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 3, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 3, 4, 64]> var_1342 = transpose(perm = var_1342_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_64, x = var_1342)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> out = reshape(shape = var_1346, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 3, 256]> var_1348 = silu(x = input_189)[name = tensor<string, []>("op_1348")];
+            tensor<fp32, [12, 3, 256]> input_191 = mul(x = var_1348, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 3, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 3, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_985, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1336 = reshape(shape = var_1335, x = xt_5)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337_perm_0 = const()[name = tensor<string, []>("op_1337_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [3]>([3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> var_1337 = transpose(perm = var_1337_perm_0, x = var_1336)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [3, 12, 256]> query_5 = reshape(shape = var_1340, x = var_1337)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_56, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [4]>([1, 12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1359 = reshape(shape = var_1358, x = xt_5)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [4]> var_1360_perm_0 = const()[name = tensor<string, []>("op_1360_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> var_1360 = transpose(perm = var_1360_perm_0, x = var_1359)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [3, 12, 256]> query_5 = reshape(shape = var_1363, x = var_1360)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 3, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 3, 768]> var_1363 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 3, 768]> var_1386 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 3, 3, 256])];
-            tensor<fp32, [12, 3, 3, 256]> var_1365 = reshape(shape = concat_2, x = var_1363)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [1]> var_1366_axes_0 = const()[name = tensor<string, []>("op_1366_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 3, 3, 256]> var_1366 = expand_dims(axes = var_1366_axes_0, x = var_1365)[name = tensor<string, []>("op_1366")];
-            tensor<int32, [5]> var_1367_perm_0 = const()[name = tensor<string, []>("op_1367_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1368_axes_0 = const()[name = tensor<string, []>("op_1368_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 3, 1, 256]> var_1367 = transpose(perm = var_1367_perm_0, x = var_1366)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 3, 256]> var_1368 = squeeze(axes = var_1368_axes_0, x = var_1367)[name = tensor<string, []>("op_1368")];
+            tensor<fp32, [12, 3, 3, 256]> var_1388 = reshape(shape = concat_2, x = var_1386)[name = tensor<string, []>("op_1388")];
+            tensor<int32, [1]> var_1389_axes_0 = const()[name = tensor<string, []>("op_1389_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 3, 3, 256]> var_1389 = expand_dims(axes = var_1389_axes_0, x = var_1388)[name = tensor<string, []>("op_1389")];
+            tensor<int32, [5]> var_1390_perm_0 = const()[name = tensor<string, []>("op_1390_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1391_axes_0 = const()[name = tensor<string, []>("op_1391_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 3, 1, 256]> var_1390 = transpose(perm = var_1390_perm_0, x = var_1389)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 3, 256]> var_1391 = squeeze(axes = var_1391_axes_0, x = var_1390)[name = tensor<string, []>("op_1391")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 3, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 3, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 3, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1376 = const()[name = tensor<string, []>("op_1376"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1377 = reshape(shape = var_1376, x = q_19)[name = tensor<string, []>("op_1377")];
+            tensor<fp32, [12, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1399 = const()[name = tensor<string, []>("op_1399"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1400 = reshape(shape = var_1399, x = q_19)[name = tensor<string, []>("op_1400")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1383 = const()[name = tensor<string, []>("op_1383"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1384 = reshape(shape = var_1383, x = k_19)[name = tensor<string, []>("op_1384")];
+            tensor<int32, [3]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1407 = reshape(shape = var_1406, x = k_19)[name = tensor<string, []>("op_1407")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1390 = const()[name = tensor<string, []>("op_1390"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1391 = reshape(shape = var_1390, x = v_19)[name = tensor<string, []>("op_1391")];
+            tensor<int32, [3]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1414 = reshape(shape = var_1413, x = v_19)[name = tensor<string, []>("op_1414")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1394 = const()[name = tensor<string, []>("op_1394"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1377)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [3, 4, 12, 64]> q = reshape(shape = var_1394, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1396 = const()[name = tensor<string, []>("op_1396"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1384)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [3, 4, 12, 64]> k = reshape(shape = var_1396, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1391)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [3, 4, 12, 64]> v = reshape(shape = var_1398, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1417 = const()[name = tensor<string, []>("op_1417"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1400)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [3, 4, 12, 64]> q = reshape(shape = var_1417, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1419 = const()[name = tensor<string, []>("op_1419"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1407)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [3, 4, 12, 64]> k = reshape(shape = var_1419, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1421 = const()[name = tensor<string, []>("op_1421"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1414)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [3, 4, 12, 64]> v = reshape(shape = var_1421, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1237,36 +1248,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1401 = const()[name = tensor<string, []>("op_1401"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [12, 3, 4, 64]> var_1402 = transpose(perm = var_1401, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1406, x = var_1402)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1410 = const()[name = tensor<string, []>("op_1410"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> attn_output = reshape(shape = var_1410, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1424 = const()[name = tensor<string, []>("op_1424"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1429 = const()[name = tensor<string, []>("op_1429"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [12, 3, 4, 64]> var_1425 = transpose(perm = var_1424, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1429, x = var_1425)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> attn_output = reshape(shape = var_1433, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [3, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_985, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [3, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [3, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [3, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [3, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_56, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [3, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [3, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [3, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_985, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([1, 3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> input = reshape(shape = var_1430, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1432 = const()[name = tensor<string, []>("op_1432"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 12, 1]> var_1433 = reduce_l2_norm(axes = var_1432, keep_dims = var_988, x = input)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [3, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_56, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [4]>([1, 3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> input = reshape(shape = var_1453, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 12, 1]> var_1456 = reduce_l2_norm(axes = var_1455, keep_dims = var_55, x = input)[name = tensor<string, []>("op_1456")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 12, 1]> clip_5 = clip(alpha = var_980, beta = const_42, x = var_1433)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 3, 12, 256]> var_1435 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1435")];
+            tensor<fp32, [1, 3, 12, 1]> clip_5 = clip(alpha = var_69, beta = const_42, x = var_1456)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 3, 12, 256]> var_1458 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1458")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([3, 1, 256])];
             tensor<fp32, [3, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([3, 256, 12])];
-            tensor<fp32, [1, 3, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1435)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 3, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1458)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [3, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1277,10 +1288,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 3, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 3, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 3, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1439")];
-            tensor<int32, []> var_1441_axis_0 = const()[name = tensor<string, []>("op_1441_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1441_axis_0, values = (var_1137, nkv))[name = tensor<string, []>("op_1441")];
-            tensor<int32, []> var_1443_axis_0 = const()[name = tensor<string, []>("op_1443_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1443_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1443")];
+            tensor<fp32, [1, 3, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1462")];
+            tensor<int32, []> var_1464_axis_0 = const()[name = tensor<string, []>("op_1464_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1464_axis_0, values = (var_1160, nkv))[name = tensor<string, []>("op_1464")];
+            tensor<int32, []> var_1466_axis_0 = const()[name = tensor<string, []>("op_1466_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1466_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1466")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 667d39182df060a6ae205f7f1936b532292aaa00..b82e8993338870af6f757ecc95fccb122327ae99 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9912c0d401a73181999617e3ae8cf8781300b6effc6ddd777a0d2d293c9dc351
-size 185467
+oid sha256:dce292a0f39072fef89d7e5722dcd50d690c6e70a70a8eab1fe6cf29832302fc
+size 191012
diff --git a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Manifest.json b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Manifest.json
index 94cff4c78a7ab58f03319cc612dc624390995764..5cf676d24266c54b61b52b18af6c04e45a45d021 100644
--- a/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Manifest.json
+++ b/optimized/dih2/300ms/ls_eend_dih2_300ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "655B25EA-0632-4B1A-B464-F7DFCE1FB184": {
+        "C166D8CC-202F-4648-A5B5-459F6BCC257A": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "E19F6AA3-6B6D-4C5E-8868-BE6F38C9CFE1": {
+        "D324280A-5B20-4ADB-9871-DFBBBF9AFC4C": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "E19F6AA3-6B6D-4C5E-8868-BE6F38C9CFE1"
+    "rootModelIdentifier": "D324280A-5B20-4ADB-9871-DFBBBF9AFC4C"
 }
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/analytics/coremldata.bin b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/analytics/coremldata.bin
index 60912e32f95f143258d8e51db95a3edea8749f6c..820140bf91e3fe099ecc6f42e7c99fcdf677d533 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4159dfa0fd403ab65353f56fd6e2f87e53062263e1e136c98c4fcc19b79ffab8
+oid sha256:910909e4d986721254e02d189e9a227e6ea0bd1d273f43d6dd5d416333399b8e
 size 243
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/coremldata.bin b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/coremldata.bin
index 410afba3c04fb8e61c00656e4d9170768e2bbe4a..4af4939a9045bcbd57ae5aca0a2b8ad2cc3dba00 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/coremldata.bin
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a4deedfd08d987353bb9e2e6585c389e10e71fd41bcbe7d5b3140ff4ff9bf985
-size 1308
+oid sha256:02454ef8dbe689c98b17a5dc454f28d1b26bd1c945e5ec74b62390d6d4fca1ae
+size 1411
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/metadata.json b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/metadata.json
index c179a7eaf3aa7525369b7d58fdb519fbdc635371..607e5330f76f0f52016a51e5bcf2c00979aaa521 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/metadata.json
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=4, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=4, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 64,
+      "Ios17.sliceByIndex" : 68,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 22,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 4 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 45 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 4, 345]",
+        "shape" : "[1, 45, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 45}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/model.mil b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/model.mil
index 7135f6328b831d1038e863b851b7563d75bba5c3..546c200804eb017fad873f0d353fbf1c38982ea7 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/model.mil
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlmodelc/model.mil
@@ -1,234 +1,256 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 4, 345]> features, tensor<fp32, [4]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [4, 4]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [4]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
-            tensor<fp32, [4, 4]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 45, 23]> features, tensor<fp32, [4]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [4, 4]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [4]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
+            tensor<fp32, [4, 4]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<int32, [3]>([1, 4, 345])];
+            tensor<fp32, [1, 4, 345]> input_1 = reshape(shape = var_56, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_65 = const()[name = tensor<string, []>("op_65"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_71 = const()[name = tensor<string, []>("op_71"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 4, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 4, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 4, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_66, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 4, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 4, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_204 = mul(x = input_13, y = var_203)[name = tensor<string, []>("op_204")];
+            tensor<fp32, [1, 4, 256]> input_15 = add(x = var_204, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,173 +261,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 4, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 4, 256]> var_218 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_219 = const()[name = tensor<string, []>("op_219"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_220 = reshape(shape = var_219, x = var_218)[name = tensor<string, []>("op_220")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 4, 256]> var_224 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_225 = const()[name = tensor<string, []>("op_225"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_226 = mul(x = var_224, y = var_225)[name = tensor<string, []>("op_226")];
+            tensor<int32, [4]> var_227 = const()[name = tensor<string, []>("op_227"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_228 = reshape(shape = var_227, x = var_226)[name = tensor<string, []>("op_228")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 4, 256]> var_232 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_233 = const()[name = tensor<string, []>("op_233"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_234 = reshape(shape = var_233, x = var_232)[name = tensor<string, []>("op_234")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 4, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [4]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_228)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_220)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 4, 4]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [4, 4]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 4, 4]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_245 = reshape(shape = var_244, x = sqrt_s_t_1)[name = tensor<string, []>("op_245")];
+            tensor<fp32, [4, 4]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_245)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 4, 4]> var_247 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_247")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [4]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_234)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_247, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_249_transpose_x_0 = const()[name = tensor<string, []>("op_249_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_249_transpose_y_0 = const()[name = tensor<string, []>("op_249_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_249 = matmul(transpose_x = var_249_transpose_x_0, transpose_y = var_249_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [4]> var_250 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_250")];
+            tensor<int32, [4]> var_251 = const()[name = tensor<string, []>("op_251"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_252 = reshape(shape = var_251, x = var_250)[name = tensor<string, []>("op_252")];
+            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_249, y = var_252)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 4, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_255 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_255")];
+            tensor<bool, []> var_257_transpose_x_1 = const()[name = tensor<string, []>("op_257_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_257_transpose_y_1 = const()[name = tensor<string, []>("op_257_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_257 = matmul(transpose_x = var_257_transpose_x_1, transpose_y = var_257_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_255, y = var_257)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_259 = const()[name = tensor<string, []>("op_259"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_259)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_261 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_261")];
+            tensor<fp32, [1, 4, 64, 64]> var_262 = real_div(x = new_kv_unnorm_1, y = var_261)[name = tensor<string, []>("op_262")];
+            tensor<int32, [4]> var_263_perm_0 = const()[name = tensor<string, []>("op_263_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 4, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 4, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 4, 4, 64]> var_263 = transpose(perm = var_263_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_74, x = var_263)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_267 = const()[name = tensor<string, []>("op_267"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_267, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 4, 256]> var_269 = silu(x = input_19)[name = tensor<string, []>("op_269")];
+            tensor<fp32, [1, 4, 256]> input_21 = mul(x = var_269, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_277_begin_0 = const()[name = tensor<string, []>("op_277_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_277_end_0 = const()[name = tensor<string, []>("op_277_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_277_end_mask_0 = const()[name = tensor<string, []>("op_277_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_277 = slice_by_index(begin = var_277_begin_0, end = var_277_end_0, end_mask = var_277_end_mask_0, x = x_3)[name = tensor<string, []>("op_277")];
+            tensor<int32, [3]> var_280_begin_0 = const()[name = tensor<string, []>("op_280_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_280_end_0 = const()[name = tensor<string, []>("op_280_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_280_end_mask_0 = const()[name = tensor<string, []>("op_280_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_280 = slice_by_index(begin = var_280_begin_0, end = var_280_end_0, end_mask = var_280_end_mask_0, x = window_1)[name = tensor<string, []>("op_280")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_82, interleave = window_3_interleave_0, values = (var_280, var_277))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_285_begin_0 = const()[name = tensor<string, []>("op_285_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_285_end_0 = const()[name = tensor<string, []>("op_285_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_285_end_mask_0 = const()[name = tensor<string, []>("op_285_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_285 = slice_by_index(begin = var_285_begin_0, end = var_285_end_0, end_mask = var_285_end_mask_0, x = x_3)[name = tensor<string, []>("op_285")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = window_3)[name = tensor<string, []>("op_288")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_82, interleave = window_5_interleave_0, values = (var_288, var_285))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_293_begin_0 = const()[name = tensor<string, []>("op_293_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_293_end_0 = const()[name = tensor<string, []>("op_293_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_293_end_mask_0 = const()[name = tensor<string, []>("op_293_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_293 = slice_by_index(begin = var_293_begin_0, end = var_293_end_0, end_mask = var_293_end_mask_0, x = x_3)[name = tensor<string, []>("op_293")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = window_5)[name = tensor<string, []>("op_296")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_245_begin_0 = const()[name = tensor<string, []>("op_245_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_245_end_0 = const()[name = tensor<string, []>("op_245_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_245_end_mask_0 = const()[name = tensor<string, []>("op_245_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_245 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = x_3)[name = tensor<string, []>("op_245")];
-            tensor<int32, [3]> var_248_begin_0 = const()[name = tensor<string, []>("op_248_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_248_end_0 = const()[name = tensor<string, []>("op_248_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_248_end_mask_0 = const()[name = tensor<string, []>("op_248_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_248 = slice_by_index(begin = var_248_begin_0, end = var_248_end_0, end_mask = var_248_end_mask_0, x = window_7)[name = tensor<string, []>("op_248")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_82, interleave = window_7_interleave_0, values = (var_296, var_293))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_301 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = x_3)[name = tensor<string, []>("op_301")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = window_7)[name = tensor<string, []>("op_304")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_248, var_245))[name = tensor<string, []>("window_9")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_82, interleave = window_9_interleave_0, values = (var_304, var_301))[name = tensor<string, []>("window_9")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_23 = concat(axis = var_69, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_273_split_sizes_0 = const()[name = tensor<string, []>("op_273_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_273_axis_0 = const()[name = tensor<string, []>("op_273_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_273_0, tensor<fp32, [4, 256, 16]> var_273_1 = split(axis = var_273_axis_0, split_sizes = var_273_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_273")];
-            tensor<fp32, [4, 256, 16]> var_275 = sigmoid(x = var_273_1)[name = tensor<string, []>("op_275")];
-            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_273_0, y = var_275)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [4, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_329_split_sizes_0 = const()[name = tensor<string, []>("op_329_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_329_axis_0 = const()[name = tensor<string, []>("op_329_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_329_0, tensor<fp32, [4, 256, 16]> var_329_1 = split(axis = var_329_axis_0, split_sizes = var_329_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_329")];
+            tensor<fp32, [4, 256, 16]> var_331 = sigmoid(x = var_329_1)[name = tensor<string, []>("op_331")];
+            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_329_0, y = var_331)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [4, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [4, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [4, 1, 256]> var_306 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_306")];
-            tensor<int32, [3]> var_308_perm_0 = const()[name = tensor<string, []>("op_308_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_308 = transpose(perm = var_308_perm_0, x = var_306)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 4, 256]> input_31 = add(x = x_3, y = var_308)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 4, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 4, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_331 = const()[name = tensor<string, []>("op_331"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_332 = mul(x = input_39, y = var_331)[name = tensor<string, []>("op_332")];
-            tensor<fp32, [1, 4, 256]> input_41 = add(x = var_332, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_362_begin_0 = const()[name = tensor<string, []>("op_362_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_362_end_0 = const()[name = tensor<string, []>("op_362_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_362_end_mask_0 = const()[name = tensor<string, []>("op_362_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [4, 1, 256]> var_362 = slice_by_index(begin = var_362_begin_0, end = var_362_end_0, end_mask = var_362_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_362")];
+            tensor<int32, [3]> var_364_perm_0 = const()[name = tensor<string, []>("op_364_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_364 = transpose(perm = var_364_perm_0, x = var_362)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 4, 256]> input_33 = add(x = x_3, y = var_364)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 4, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 4, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_388 = mul(x = input_41, y = var_387)[name = tensor<string, []>("op_388")];
+            tensor<fp32, [1, 4, 256]> input_43 = add(x = var_388, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 4, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 4, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_362 = mul(x = input_51, y = var_361)[name = tensor<string, []>("op_362")];
-            tensor<fp32, [1, 4, 256]> input_53 = add(x = var_362, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 4, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 4, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_418 = mul(x = input_53, y = var_417)[name = tensor<string, []>("op_418")];
+            tensor<fp32, [1, 4, 256]> input_55 = add(x = var_418, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -416,173 +438,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 4, 256]> var_376 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 4, 256]> var_432 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_433 = const()[name = tensor<string, []>("op_433"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_434 = reshape(shape = var_433, x = var_432)[name = tensor<string, []>("op_434")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_382 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_384 = mul(x = var_382, y = var_383)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
+            tensor<fp32, [1, 4, 256]> var_438 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_439 = const()[name = tensor<string, []>("op_439"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_440 = mul(x = var_438, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441 = const()[name = tensor<string, []>("op_441"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_442 = reshape(shape = var_441, x = var_440)[name = tensor<string, []>("op_442")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_390 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_391 = const()[name = tensor<string, []>("op_391"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_392 = reshape(shape = var_391, x = var_390)[name = tensor<string, []>("op_392")];
+            tensor<fp32, [1, 4, 256]> var_446 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 4, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [4]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_386)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_442)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_434)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 4, 4]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_402 = const()[name = tensor<string, []>("op_402"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_403 = reshape(shape = var_402, x = sqrt_s_t_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [4, 4]> M_3 = real_div(x = encoder__causal_mask, y = var_403)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 4, 4]> var_405 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_405")];
+            tensor<int32, [2]> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_459 = reshape(shape = var_458, x = sqrt_s_t_3)[name = tensor<string, []>("op_459")];
+            tensor<fp32, [4, 4]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_459)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 4, 4]> var_461 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_461")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_392)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_405, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_407_transpose_x_0 = const()[name = tensor<string, []>("op_407_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_407_transpose_y_0 = const()[name = tensor<string, []>("op_407_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_0, transpose_y = var_407_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [4]> var_408 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_408")];
-            tensor<int32, [4]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_410 = reshape(shape = var_409, x = var_408)[name = tensor<string, []>("op_410")];
-            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_407, y = var_410)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_448)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_461, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_463_transpose_x_0 = const()[name = tensor<string, []>("op_463_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_463_transpose_y_0 = const()[name = tensor<string, []>("op_463_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_463 = matmul(transpose_x = var_463_transpose_x_0, transpose_y = var_463_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [4]> var_464 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_464")];
+            tensor<int32, [4]> var_465 = const()[name = tensor<string, []>("op_465"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_466 = reshape(shape = var_465, x = var_464)[name = tensor<string, []>("op_466")];
+            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_463, y = var_466)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 4, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_413 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_413")];
-            tensor<bool, []> var_415_transpose_x_1 = const()[name = tensor<string, []>("op_415_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_415_transpose_y_1 = const()[name = tensor<string, []>("op_415_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_415 = matmul(transpose_x = var_415_transpose_x_1, transpose_y = var_415_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_415")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_413, y = var_415)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_417)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_419 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 64, 64]> var_420 = real_div(x = new_kv_unnorm_3, y = var_419)[name = tensor<string, []>("op_420")];
-            tensor<int32, [4]> var_421_perm_0 = const()[name = tensor<string, []>("op_421_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_469 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_469")];
+            tensor<bool, []> var_471_transpose_x_1 = const()[name = tensor<string, []>("op_471_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_471_transpose_y_1 = const()[name = tensor<string, []>("op_471_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_471 = matmul(transpose_x = var_471_transpose_x_1, transpose_y = var_471_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_469, y = var_471)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_473 = const()[name = tensor<string, []>("op_473"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_473)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_475 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_475")];
+            tensor<fp32, [1, 4, 64, 64]> var_476 = real_div(x = new_kv_unnorm_3, y = var_475)[name = tensor<string, []>("op_476")];
+            tensor<int32, [4]> var_477_perm_0 = const()[name = tensor<string, []>("op_477_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_421 = transpose(perm = var_421_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_421)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_425 = const()[name = tensor<string, []>("op_425"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_425, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 4, 256]> var_427 = silu(x = input_57)[name = tensor<string, []>("op_427")];
-            tensor<fp32, [1, 4, 256]> input_59 = mul(x = var_427, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 4, 4, 64]> var_477 = transpose(perm = var_477_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_74, x = var_477)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_481 = const()[name = tensor<string, []>("op_481"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_481, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 4, 256]> var_483 = silu(x = input_59)[name = tensor<string, []>("op_483")];
+            tensor<fp32, [1, 4, 256]> input_61 = mul(x = var_483, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_11_begin_0 = const()[name = tensor<string, []>("window_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_11_end_0 = const()[name = tensor<string, []>("window_11_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_11_end_mask_0 = const()[name = tensor<string, []>("window_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_11_squeeze_mask_0 = const()[name = tensor<string, []>("window_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_11 = slice_by_index(begin = window_11_begin_0, end = window_11_end_0, end_mask = window_11_end_mask_0, squeeze_mask = window_11_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<int32, [3]> var_491_begin_0 = const()[name = tensor<string, []>("op_491_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_491_end_0 = const()[name = tensor<string, []>("op_491_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_491_end_mask_0 = const()[name = tensor<string, []>("op_491_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_491 = slice_by_index(begin = var_491_begin_0, end = var_491_end_0, end_mask = var_491_end_mask_0, x = x_9)[name = tensor<string, []>("op_491")];
+            tensor<int32, [3]> var_494_begin_0 = const()[name = tensor<string, []>("op_494_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_494_end_0 = const()[name = tensor<string, []>("op_494_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_494_end_mask_0 = const()[name = tensor<string, []>("op_494_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_494 = slice_by_index(begin = var_494_begin_0, end = var_494_end_0, end_mask = var_494_end_mask_0, x = window_11)[name = tensor<string, []>("op_494")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_82, interleave = window_13_interleave_0, values = (var_494, var_491))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_499_begin_0 = const()[name = tensor<string, []>("op_499_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_499_end_0 = const()[name = tensor<string, []>("op_499_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_499_end_mask_0 = const()[name = tensor<string, []>("op_499_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_499 = slice_by_index(begin = var_499_begin_0, end = var_499_end_0, end_mask = var_499_end_mask_0, x = x_9)[name = tensor<string, []>("op_499")];
+            tensor<int32, [3]> var_502_begin_0 = const()[name = tensor<string, []>("op_502_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_502_end_0 = const()[name = tensor<string, []>("op_502_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_502_end_mask_0 = const()[name = tensor<string, []>("op_502_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_502 = slice_by_index(begin = var_502_begin_0, end = var_502_end_0, end_mask = var_502_end_mask_0, x = window_13)[name = tensor<string, []>("op_502")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_451_begin_0 = const()[name = tensor<string, []>("op_451_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_451_end_0 = const()[name = tensor<string, []>("op_451_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_451_end_mask_0 = const()[name = tensor<string, []>("op_451_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_451 = slice_by_index(begin = var_451_begin_0, end = var_451_end_0, end_mask = var_451_end_mask_0, x = x_9)[name = tensor<string, []>("op_451")];
-            tensor<int32, [3]> var_454_begin_0 = const()[name = tensor<string, []>("op_454_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_454_end_0 = const()[name = tensor<string, []>("op_454_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_454_end_mask_0 = const()[name = tensor<string, []>("op_454_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_454 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = window_15)[name = tensor<string, []>("op_454")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_82, interleave = window_15_interleave_0, values = (var_502, var_499))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_507_begin_0 = const()[name = tensor<string, []>("op_507_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_507_end_0 = const()[name = tensor<string, []>("op_507_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_507_end_mask_0 = const()[name = tensor<string, []>("op_507_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_507 = slice_by_index(begin = var_507_begin_0, end = var_507_end_0, end_mask = var_507_end_mask_0, x = x_9)[name = tensor<string, []>("op_507")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = window_15)[name = tensor<string, []>("op_510")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_454, var_451))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_459_begin_0 = const()[name = tensor<string, []>("op_459_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_459_end_0 = const()[name = tensor<string, []>("op_459_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_459_end_mask_0 = const()[name = tensor<string, []>("op_459_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_459 = slice_by_index(begin = var_459_begin_0, end = var_459_end_0, end_mask = var_459_end_mask_0, x = x_9)[name = tensor<string, []>("op_459")];
-            tensor<int32, [3]> var_462_begin_0 = const()[name = tensor<string, []>("op_462_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_462_end_0 = const()[name = tensor<string, []>("op_462_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_462_end_mask_0 = const()[name = tensor<string, []>("op_462_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_462 = slice_by_index(begin = var_462_begin_0, end = var_462_end_0, end_mask = var_462_end_mask_0, x = window_17)[name = tensor<string, []>("op_462")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_82, interleave = window_17_interleave_0, values = (var_510, var_507))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_515_begin_0 = const()[name = tensor<string, []>("op_515_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_515_end_0 = const()[name = tensor<string, []>("op_515_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_515_end_mask_0 = const()[name = tensor<string, []>("op_515_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_515 = slice_by_index(begin = var_515_begin_0, end = var_515_end_0, end_mask = var_515_end_mask_0, x = x_9)[name = tensor<string, []>("op_515")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = window_17)[name = tensor<string, []>("op_518")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_462, var_459))[name = tensor<string, []>("window_19")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_82, interleave = window_19_interleave_0, values = (var_518, var_515))[name = tensor<string, []>("window_19")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_63 = concat(axis = var_69, interleave = input_63_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_487_split_sizes_0 = const()[name = tensor<string, []>("op_487_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_487_axis_0 = const()[name = tensor<string, []>("op_487_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_487_0, tensor<fp32, [4, 256, 16]> var_487_1 = split(axis = var_487_axis_0, split_sizes = var_487_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_487")];
-            tensor<fp32, [4, 256, 16]> var_489 = sigmoid(x = var_487_1)[name = tensor<string, []>("op_489")];
-            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_487_0, y = var_489)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [4, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_543_split_sizes_0 = const()[name = tensor<string, []>("op_543_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_543_axis_0 = const()[name = tensor<string, []>("op_543_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_543_0, tensor<fp32, [4, 256, 16]> var_543_1 = split(axis = var_543_axis_0, split_sizes = var_543_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_543")];
+            tensor<fp32, [4, 256, 16]> var_545 = sigmoid(x = var_543_1)[name = tensor<string, []>("op_545")];
+            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_543_0, y = var_545)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [4, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [4, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_520_begin_0 = const()[name = tensor<string, []>("op_520_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_520_end_0 = const()[name = tensor<string, []>("op_520_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_520_end_mask_0 = const()[name = tensor<string, []>("op_520_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [4, 1, 256]> var_520 = slice_by_index(begin = var_520_begin_0, end = var_520_end_0, end_mask = var_520_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_520")];
-            tensor<int32, [3]> var_522_perm_0 = const()[name = tensor<string, []>("op_522_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_522 = transpose(perm = var_522_perm_0, x = var_520)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 4, 256]> input_71 = add(x = x_9, y = var_522)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 4, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 4, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_545 = const()[name = tensor<string, []>("op_545"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_546 = mul(x = input_79, y = var_545)[name = tensor<string, []>("op_546")];
-            tensor<fp32, [1, 4, 256]> input_81 = add(x = var_546, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_576_begin_0 = const()[name = tensor<string, []>("op_576_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_576_end_0 = const()[name = tensor<string, []>("op_576_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_576_end_mask_0 = const()[name = tensor<string, []>("op_576_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [4, 1, 256]> var_576 = slice_by_index(begin = var_576_begin_0, end = var_576_end_0, end_mask = var_576_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_576")];
+            tensor<int32, [3]> var_578_perm_0 = const()[name = tensor<string, []>("op_578_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_578 = transpose(perm = var_578_perm_0, x = var_576)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 4, 256]> input_73 = add(x = x_9, y = var_578)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 4, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 4, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_602 = mul(x = input_81, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 256]> input_83 = add(x = var_602, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 4, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 4, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_576 = mul(x = input_91, y = var_575)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 256]> input_93 = add(x = var_576, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 4, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 4, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_632 = mul(x = input_93, y = var_631)[name = tensor<string, []>("op_632")];
+            tensor<fp32, [1, 4, 256]> input_95 = add(x = var_632, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -593,173 +615,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 4, 256]> var_590 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
+            tensor<fp32, [1, 4, 256]> var_646 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_647 = const()[name = tensor<string, []>("op_647"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_648 = reshape(shape = var_647, x = var_646)[name = tensor<string, []>("op_648")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_596 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_597 = const()[name = tensor<string, []>("op_597"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_598 = mul(x = var_596, y = var_597)[name = tensor<string, []>("op_598")];
-            tensor<int32, [4]> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_600 = reshape(shape = var_599, x = var_598)[name = tensor<string, []>("op_600")];
+            tensor<fp32, [1, 4, 256]> var_652 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_654 = mul(x = var_652, y = var_653)[name = tensor<string, []>("op_654")];
+            tensor<int32, [4]> var_655 = const()[name = tensor<string, []>("op_655"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_656 = reshape(shape = var_655, x = var_654)[name = tensor<string, []>("op_656")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_604 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_606 = reshape(shape = var_605, x = var_604)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 4, 256]> var_660 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_662 = reshape(shape = var_661, x = var_660)[name = tensor<string, []>("op_662")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 4, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [4]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_600)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_592)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_656)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_648)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 4, 4]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_617 = reshape(shape = var_616, x = sqrt_s_t_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [4, 4]> M_5 = real_div(x = encoder__causal_mask, y = var_617)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 4, 4]> var_619 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_619")];
+            tensor<int32, [2]> var_672 = const()[name = tensor<string, []>("op_672"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_673 = reshape(shape = var_672, x = sqrt_s_t_5)[name = tensor<string, []>("op_673")];
+            tensor<fp32, [4, 4]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_673)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 4, 4]> var_675 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_675")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_606)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_619, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_621_transpose_x_0 = const()[name = tensor<string, []>("op_621_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_621_transpose_y_0 = const()[name = tensor<string, []>("op_621_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_621 = matmul(transpose_x = var_621_transpose_x_0, transpose_y = var_621_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_621")];
-            tensor<fp32, [4]> var_622 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_622")];
-            tensor<int32, [4]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_624 = reshape(shape = var_623, x = var_622)[name = tensor<string, []>("op_624")];
-            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_621, y = var_624)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_662)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_675, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_677_transpose_x_0 = const()[name = tensor<string, []>("op_677_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_677_transpose_y_0 = const()[name = tensor<string, []>("op_677_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_677 = matmul(transpose_x = var_677_transpose_x_0, transpose_y = var_677_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [4]> var_678 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_678")];
+            tensor<int32, [4]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_680 = reshape(shape = var_679, x = var_678)[name = tensor<string, []>("op_680")];
+            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_677, y = var_680)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 4, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_627 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_627")];
-            tensor<bool, []> var_629_transpose_x_1 = const()[name = tensor<string, []>("op_629_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_629_transpose_y_1 = const()[name = tensor<string, []>("op_629_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_629 = matmul(transpose_x = var_629_transpose_x_1, transpose_y = var_629_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_627, y = var_629)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_631)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_633 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_633")];
-            tensor<fp32, [1, 4, 64, 64]> var_634 = real_div(x = new_kv_unnorm_5, y = var_633)[name = tensor<string, []>("op_634")];
-            tensor<int32, [4]> var_635_perm_0 = const()[name = tensor<string, []>("op_635_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_683 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_683")];
+            tensor<bool, []> var_685_transpose_x_1 = const()[name = tensor<string, []>("op_685_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_685_transpose_y_1 = const()[name = tensor<string, []>("op_685_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_685 = matmul(transpose_x = var_685_transpose_x_1, transpose_y = var_685_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_683, y = var_685)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_687)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_689 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [1, 4, 64, 64]> var_690 = real_div(x = new_kv_unnorm_5, y = var_689)[name = tensor<string, []>("op_690")];
+            tensor<int32, [4]> var_691_perm_0 = const()[name = tensor<string, []>("op_691_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_635 = transpose(perm = var_635_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_635)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_639 = const()[name = tensor<string, []>("op_639"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_639, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 4, 256]> var_641 = silu(x = input_97)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 256]> input_99 = mul(x = var_641, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 4, 4, 64]> var_691 = transpose(perm = var_691_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_74, x = var_691)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_695 = const()[name = tensor<string, []>("op_695"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_695, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 4, 256]> var_697 = silu(x = input_99)[name = tensor<string, []>("op_697")];
+            tensor<fp32, [1, 4, 256]> input_101 = mul(x = var_697, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_21_begin_0 = const()[name = tensor<string, []>("window_21_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_21_end_0 = const()[name = tensor<string, []>("window_21_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_21_end_mask_0 = const()[name = tensor<string, []>("window_21_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_21_squeeze_mask_0 = const()[name = tensor<string, []>("window_21_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_21 = slice_by_index(begin = window_21_begin_0, end = window_21_end_0, end_mask = window_21_end_mask_0, squeeze_mask = window_21_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<int32, [3]> var_705_begin_0 = const()[name = tensor<string, []>("op_705_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_705_end_0 = const()[name = tensor<string, []>("op_705_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_705_end_mask_0 = const()[name = tensor<string, []>("op_705_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_705 = slice_by_index(begin = var_705_begin_0, end = var_705_end_0, end_mask = var_705_end_mask_0, x = x_15)[name = tensor<string, []>("op_705")];
+            tensor<int32, [3]> var_708_begin_0 = const()[name = tensor<string, []>("op_708_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_708_end_0 = const()[name = tensor<string, []>("op_708_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_708_end_mask_0 = const()[name = tensor<string, []>("op_708_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_708 = slice_by_index(begin = var_708_begin_0, end = var_708_end_0, end_mask = var_708_end_mask_0, x = window_21)[name = tensor<string, []>("op_708")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<int32, [3]> var_657_begin_0 = const()[name = tensor<string, []>("op_657_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_657_end_0 = const()[name = tensor<string, []>("op_657_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_657_end_mask_0 = const()[name = tensor<string, []>("op_657_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_657 = slice_by_index(begin = var_657_begin_0, end = var_657_end_0, end_mask = var_657_end_mask_0, x = x_15)[name = tensor<string, []>("op_657")];
-            tensor<int32, [3]> var_660_begin_0 = const()[name = tensor<string, []>("op_660_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_660_end_0 = const()[name = tensor<string, []>("op_660_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_660_end_mask_0 = const()[name = tensor<string, []>("op_660_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_660 = slice_by_index(begin = var_660_begin_0, end = var_660_end_0, end_mask = var_660_end_mask_0, x = window_23)[name = tensor<string, []>("op_660")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_82, interleave = window_23_interleave_0, values = (var_708, var_705))[name = tensor<string, []>("window_23")];
+            tensor<int32, [3]> var_713_begin_0 = const()[name = tensor<string, []>("op_713_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_713_end_0 = const()[name = tensor<string, []>("op_713_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_713_end_mask_0 = const()[name = tensor<string, []>("op_713_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_713 = slice_by_index(begin = var_713_begin_0, end = var_713_end_0, end_mask = var_713_end_mask_0, x = x_15)[name = tensor<string, []>("op_713")];
+            tensor<int32, [3]> var_716_begin_0 = const()[name = tensor<string, []>("op_716_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_716_end_0 = const()[name = tensor<string, []>("op_716_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_716_end_mask_0 = const()[name = tensor<string, []>("op_716_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_716 = slice_by_index(begin = var_716_begin_0, end = var_716_end_0, end_mask = var_716_end_mask_0, x = window_23)[name = tensor<string, []>("op_716")];
             tensor<bool, []> window_25_interleave_0 = const()[name = tensor<string, []>("window_25_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_26, interleave = window_25_interleave_0, values = (var_660, var_657))[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_665_begin_0 = const()[name = tensor<string, []>("op_665_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_665_end_0 = const()[name = tensor<string, []>("op_665_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_665_end_mask_0 = const()[name = tensor<string, []>("op_665_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_665 = slice_by_index(begin = var_665_begin_0, end = var_665_end_0, end_mask = var_665_end_mask_0, x = x_15)[name = tensor<string, []>("op_665")];
-            tensor<int32, [3]> var_668_begin_0 = const()[name = tensor<string, []>("op_668_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_668_end_0 = const()[name = tensor<string, []>("op_668_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_668_end_mask_0 = const()[name = tensor<string, []>("op_668_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_668 = slice_by_index(begin = var_668_begin_0, end = var_668_end_0, end_mask = var_668_end_mask_0, x = window_25)[name = tensor<string, []>("op_668")];
+            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_82, interleave = window_25_interleave_0, values = (var_716, var_713))[name = tensor<string, []>("window_25")];
+            tensor<int32, [3]> var_721_begin_0 = const()[name = tensor<string, []>("op_721_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_721_end_0 = const()[name = tensor<string, []>("op_721_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_721_end_mask_0 = const()[name = tensor<string, []>("op_721_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_721 = slice_by_index(begin = var_721_begin_0, end = var_721_end_0, end_mask = var_721_end_mask_0, x = x_15)[name = tensor<string, []>("op_721")];
+            tensor<int32, [3]> var_724_begin_0 = const()[name = tensor<string, []>("op_724_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_724_end_0 = const()[name = tensor<string, []>("op_724_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_724_end_mask_0 = const()[name = tensor<string, []>("op_724_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_724 = slice_by_index(begin = var_724_begin_0, end = var_724_end_0, end_mask = var_724_end_mask_0, x = window_25)[name = tensor<string, []>("op_724")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_668, var_665))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_673_begin_0 = const()[name = tensor<string, []>("op_673_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_673_end_0 = const()[name = tensor<string, []>("op_673_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_673_end_mask_0 = const()[name = tensor<string, []>("op_673_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_673 = slice_by_index(begin = var_673_begin_0, end = var_673_end_0, end_mask = var_673_end_mask_0, x = x_15)[name = tensor<string, []>("op_673")];
-            tensor<int32, [3]> var_676_begin_0 = const()[name = tensor<string, []>("op_676_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_676_end_0 = const()[name = tensor<string, []>("op_676_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_676_end_mask_0 = const()[name = tensor<string, []>("op_676_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_676 = slice_by_index(begin = var_676_begin_0, end = var_676_end_0, end_mask = var_676_end_mask_0, x = window_27)[name = tensor<string, []>("op_676")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_82, interleave = window_27_interleave_0, values = (var_724, var_721))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_729_begin_0 = const()[name = tensor<string, []>("op_729_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_729_end_0 = const()[name = tensor<string, []>("op_729_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_729_end_mask_0 = const()[name = tensor<string, []>("op_729_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_729 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = x_15)[name = tensor<string, []>("op_729")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = window_27)[name = tensor<string, []>("op_732")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_676, var_673))[name = tensor<string, []>("window_29")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_82, interleave = window_29_interleave_0, values = (var_732, var_729))[name = tensor<string, []>("window_29")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_103 = concat(axis = var_69, interleave = input_103_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_701_split_sizes_0 = const()[name = tensor<string, []>("op_701_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_701_axis_0 = const()[name = tensor<string, []>("op_701_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_701_0, tensor<fp32, [4, 256, 16]> var_701_1 = split(axis = var_701_axis_0, split_sizes = var_701_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_701")];
-            tensor<fp32, [4, 256, 16]> var_703 = sigmoid(x = var_701_1)[name = tensor<string, []>("op_703")];
-            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_701_0, y = var_703)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [4, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_757_split_sizes_0 = const()[name = tensor<string, []>("op_757_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_757_axis_0 = const()[name = tensor<string, []>("op_757_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_757_0, tensor<fp32, [4, 256, 16]> var_757_1 = split(axis = var_757_axis_0, split_sizes = var_757_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_757")];
+            tensor<fp32, [4, 256, 16]> var_759 = sigmoid(x = var_757_1)[name = tensor<string, []>("op_759")];
+            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_757_0, y = var_759)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [4, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [4, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_734_begin_0 = const()[name = tensor<string, []>("op_734_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_734_end_0 = const()[name = tensor<string, []>("op_734_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_734_end_mask_0 = const()[name = tensor<string, []>("op_734_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [4, 1, 256]> var_734 = slice_by_index(begin = var_734_begin_0, end = var_734_end_0, end_mask = var_734_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_734")];
-            tensor<int32, [3]> var_736_perm_0 = const()[name = tensor<string, []>("op_736_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_736 = transpose(perm = var_736_perm_0, x = var_734)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 4, 256]> input_111 = add(x = x_15, y = var_736)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 4, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 4, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_760 = mul(x = input_119, y = var_759)[name = tensor<string, []>("op_760")];
-            tensor<fp32, [1, 4, 256]> input_121 = add(x = var_760, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_790_begin_0 = const()[name = tensor<string, []>("op_790_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_790_end_0 = const()[name = tensor<string, []>("op_790_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_790_end_mask_0 = const()[name = tensor<string, []>("op_790_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [4, 1, 256]> var_790 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_790")];
+            tensor<int32, [3]> var_792_perm_0 = const()[name = tensor<string, []>("op_792_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_792 = transpose(perm = var_792_perm_0, x = var_790)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 4, 256]> input_113 = add(x = x_15, y = var_792)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 4, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 4, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_815 = const()[name = tensor<string, []>("op_815"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_816 = mul(x = input_121, y = var_815)[name = tensor<string, []>("op_816")];
+            tensor<fp32, [1, 4, 256]> input_123 = add(x = var_816, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 4, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 4, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_790 = mul(x = input_131, y = var_789)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 256]> input_133 = add(x = var_790, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 4, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 4, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_846 = mul(x = input_133, y = var_845)[name = tensor<string, []>("op_846")];
+            tensor<fp32, [1, 4, 256]> input_135 = add(x = var_846, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -770,209 +792,202 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 4, 256]> var_804 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_806 = reshape(shape = var_805, x = var_804)[name = tensor<string, []>("op_806")];
+            tensor<fp32, [1, 4, 256]> var_860 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_861 = const()[name = tensor<string, []>("op_861"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_862 = reshape(shape = var_861, x = var_860)[name = tensor<string, []>("op_862")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_810 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_812 = mul(x = var_810, y = var_811)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
+            tensor<fp32, [1, 4, 256]> var_866 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_868 = mul(x = var_866, y = var_867)[name = tensor<string, []>("op_868")];
+            tensor<int32, [4]> var_869 = const()[name = tensor<string, []>("op_869"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_870 = reshape(shape = var_869, x = var_868)[name = tensor<string, []>("op_870")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_818 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_819 = const()[name = tensor<string, []>("op_819"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_820 = reshape(shape = var_819, x = var_818)[name = tensor<string, []>("op_820")];
+            tensor<fp32, [1, 4, 256]> var_874 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_876 = reshape(shape = var_875, x = var_874)[name = tensor<string, []>("op_876")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 4, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [4]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_814)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_806)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_870)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_862)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 4, 4]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_831 = reshape(shape = var_830, x = sqrt_s_t_7)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [4, 4]> M_7 = real_div(x = encoder__causal_mask, y = var_831)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 4, 4]> var_833 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_833")];
+            tensor<int32, [2]> var_886 = const()[name = tensor<string, []>("op_886"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_887 = reshape(shape = var_886, x = sqrt_s_t_7)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [4, 4]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_887)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 4, 4]> var_889 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_889")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_820)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_833, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_835_transpose_x_0 = const()[name = tensor<string, []>("op_835_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_835_transpose_y_0 = const()[name = tensor<string, []>("op_835_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_835 = matmul(transpose_x = var_835_transpose_x_0, transpose_y = var_835_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_835")];
-            tensor<fp32, [4]> var_836 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_836")];
-            tensor<int32, [4]> var_837 = const()[name = tensor<string, []>("op_837"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_838 = reshape(shape = var_837, x = var_836)[name = tensor<string, []>("op_838")];
-            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_835, y = var_838)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_876)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_889, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_891_transpose_x_0 = const()[name = tensor<string, []>("op_891_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_891_transpose_y_0 = const()[name = tensor<string, []>("op_891_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_891 = matmul(transpose_x = var_891_transpose_x_0, transpose_y = var_891_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_891")];
+            tensor<fp32, [4]> var_892 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_892")];
+            tensor<int32, [4]> var_893 = const()[name = tensor<string, []>("op_893"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_894 = reshape(shape = var_893, x = var_892)[name = tensor<string, []>("op_894")];
+            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_891, y = var_894)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 4, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_841 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_841")];
-            tensor<bool, []> var_843_transpose_x_1 = const()[name = tensor<string, []>("op_843_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_843_transpose_y_1 = const()[name = tensor<string, []>("op_843_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_843 = matmul(transpose_x = var_843_transpose_x_1, transpose_y = var_843_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_843")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_841, y = var_843)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_845)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_847 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_847")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_847)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_849_perm_0 = const()[name = tensor<string, []>("op_849_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_897 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_897")];
+            tensor<bool, []> var_899_transpose_x_1 = const()[name = tensor<string, []>("op_899_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_899_transpose_y_1 = const()[name = tensor<string, []>("op_899_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_899 = matmul(transpose_x = var_899_transpose_x_1, transpose_y = var_899_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_897, y = var_899)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_901 = const()[name = tensor<string, []>("op_901"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_901)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_903 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_903")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_903)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_905_perm_0 = const()[name = tensor<string, []>("op_905_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_849 = transpose(perm = var_849_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_849)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_853 = const()[name = tensor<string, []>("op_853"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_853, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 4, 256]> var_855 = silu(x = input_137)[name = tensor<string, []>("op_855")];
-            tensor<fp32, [1, 4, 256]> input_139 = mul(x = var_855, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 4, 4, 64]> var_905 = transpose(perm = var_905_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_74, x = var_905)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_909, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 4, 256]> var_911 = silu(x = input_139)[name = tensor<string, []>("op_911")];
+            tensor<fp32, [1, 4, 256]> input_141 = mul(x = var_911, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_31_begin_0 = const()[name = tensor<string, []>("window_31_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_31_end_0 = const()[name = tensor<string, []>("window_31_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_31_end_mask_0 = const()[name = tensor<string, []>("window_31_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_31_squeeze_mask_0 = const()[name = tensor<string, []>("window_31_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_31 = slice_by_index(begin = window_31_begin_0, end = window_31_end_0, end_mask = window_31_end_mask_0, squeeze_mask = window_31_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_863_begin_0 = const()[name = tensor<string, []>("op_863_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_863_end_0 = const()[name = tensor<string, []>("op_863_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_863_end_mask_0 = const()[name = tensor<string, []>("op_863_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_863 = slice_by_index(begin = var_863_begin_0, end = var_863_end_0, end_mask = var_863_end_mask_0, x = x_21)[name = tensor<string, []>("op_863")];
-            tensor<int32, [3]> var_866_begin_0 = const()[name = tensor<string, []>("op_866_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_866_end_0 = const()[name = tensor<string, []>("op_866_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_866_end_mask_0 = const()[name = tensor<string, []>("op_866_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_866 = slice_by_index(begin = var_866_begin_0, end = var_866_end_0, end_mask = var_866_end_mask_0, x = window_31)[name = tensor<string, []>("op_866")];
+            tensor<int32, [3]> var_919_begin_0 = const()[name = tensor<string, []>("op_919_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_919_end_0 = const()[name = tensor<string, []>("op_919_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_919_end_mask_0 = const()[name = tensor<string, []>("op_919_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_919 = slice_by_index(begin = var_919_begin_0, end = var_919_end_0, end_mask = var_919_end_mask_0, x = x_21)[name = tensor<string, []>("op_919")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_922 = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = window_31)[name = tensor<string, []>("op_922")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_26, interleave = window_33_interleave_0, values = (var_866, var_863))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_871 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = x_21)[name = tensor<string, []>("op_871")];
-            tensor<int32, [3]> var_874_begin_0 = const()[name = tensor<string, []>("op_874_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_874_end_0 = const()[name = tensor<string, []>("op_874_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_874_end_mask_0 = const()[name = tensor<string, []>("op_874_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_874 = slice_by_index(begin = var_874_begin_0, end = var_874_end_0, end_mask = var_874_end_mask_0, x = window_33)[name = tensor<string, []>("op_874")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_82, interleave = window_33_interleave_0, values = (var_922, var_919))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_927_begin_0 = const()[name = tensor<string, []>("op_927_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_927_end_0 = const()[name = tensor<string, []>("op_927_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_927_end_mask_0 = const()[name = tensor<string, []>("op_927_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_927 = slice_by_index(begin = var_927_begin_0, end = var_927_end_0, end_mask = var_927_end_mask_0, x = x_21)[name = tensor<string, []>("op_927")];
+            tensor<int32, [3]> var_930_begin_0 = const()[name = tensor<string, []>("op_930_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_930_end_0 = const()[name = tensor<string, []>("op_930_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_930_end_mask_0 = const()[name = tensor<string, []>("op_930_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_930 = slice_by_index(begin = var_930_begin_0, end = var_930_end_0, end_mask = var_930_end_mask_0, x = window_33)[name = tensor<string, []>("op_930")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_26, interleave = window_35_interleave_0, values = (var_874, var_871))[name = tensor<string, []>("window_35")];
-            tensor<int32, [3]> var_879_begin_0 = const()[name = tensor<string, []>("op_879_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_879_end_0 = const()[name = tensor<string, []>("op_879_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_879_end_mask_0 = const()[name = tensor<string, []>("op_879_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_879 = slice_by_index(begin = var_879_begin_0, end = var_879_end_0, end_mask = var_879_end_mask_0, x = x_21)[name = tensor<string, []>("op_879")];
-            tensor<int32, [3]> var_882_begin_0 = const()[name = tensor<string, []>("op_882_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_882_end_0 = const()[name = tensor<string, []>("op_882_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_882_end_mask_0 = const()[name = tensor<string, []>("op_882_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_882 = slice_by_index(begin = var_882_begin_0, end = var_882_end_0, end_mask = var_882_end_mask_0, x = window_35)[name = tensor<string, []>("op_882")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_82, interleave = window_35_interleave_0, values = (var_930, var_927))[name = tensor<string, []>("window_35")];
+            tensor<int32, [3]> var_935_begin_0 = const()[name = tensor<string, []>("op_935_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_935_end_0 = const()[name = tensor<string, []>("op_935_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_935_end_mask_0 = const()[name = tensor<string, []>("op_935_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_935 = slice_by_index(begin = var_935_begin_0, end = var_935_end_0, end_mask = var_935_end_mask_0, x = x_21)[name = tensor<string, []>("op_935")];
+            tensor<int32, [3]> var_938_begin_0 = const()[name = tensor<string, []>("op_938_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_938_end_0 = const()[name = tensor<string, []>("op_938_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_938_end_mask_0 = const()[name = tensor<string, []>("op_938_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_938 = slice_by_index(begin = var_938_begin_0, end = var_938_end_0, end_mask = var_938_end_mask_0, x = window_35)[name = tensor<string, []>("op_938")];
             tensor<bool, []> window_37_interleave_0 = const()[name = tensor<string, []>("window_37_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_26, interleave = window_37_interleave_0, values = (var_882, var_879))[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_887_begin_0 = const()[name = tensor<string, []>("op_887_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_887_end_0 = const()[name = tensor<string, []>("op_887_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_887_end_mask_0 = const()[name = tensor<string, []>("op_887_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_887 = slice_by_index(begin = var_887_begin_0, end = var_887_end_0, end_mask = var_887_end_mask_0, x = x_21)[name = tensor<string, []>("op_887")];
-            tensor<int32, [3]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_890 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = window_37)[name = tensor<string, []>("op_890")];
+            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_82, interleave = window_37_interleave_0, values = (var_938, var_935))[name = tensor<string, []>("window_37")];
+            tensor<int32, [3]> var_943_begin_0 = const()[name = tensor<string, []>("op_943_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_943_end_0 = const()[name = tensor<string, []>("op_943_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_943_end_mask_0 = const()[name = tensor<string, []>("op_943_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_943 = slice_by_index(begin = var_943_begin_0, end = var_943_end_0, end_mask = var_943_end_mask_0, x = x_21)[name = tensor<string, []>("op_943")];
+            tensor<int32, [3]> var_946_begin_0 = const()[name = tensor<string, []>("op_946_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_946_end_0 = const()[name = tensor<string, []>("op_946_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_946_end_mask_0 = const()[name = tensor<string, []>("op_946_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_946 = slice_by_index(begin = var_946_begin_0, end = var_946_end_0, end_mask = var_946_end_mask_0, x = window_37)[name = tensor<string, []>("op_946")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_890, var_887))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_82, interleave = window_interleave_0, values = (var_946, var_943))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_143 = concat(axis = var_69, interleave = input_143_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_915_split_sizes_0 = const()[name = tensor<string, []>("op_915_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_915_axis_0 = const()[name = tensor<string, []>("op_915_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_915_0, tensor<fp32, [4, 256, 16]> var_915_1 = split(axis = var_915_axis_0, split_sizes = var_915_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_915")];
-            tensor<fp32, [4, 256, 16]> var_917 = sigmoid(x = var_915_1)[name = tensor<string, []>("op_917")];
-            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_915_0, y = var_917)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [4, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_971_split_sizes_0 = const()[name = tensor<string, []>("op_971_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_971_0, tensor<fp32, [4, 256, 16]> var_971_1 = split(axis = var_971_axis_0, split_sizes = var_971_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_971")];
+            tensor<fp32, [4, 256, 16]> var_973 = sigmoid(x = var_971_1)[name = tensor<string, []>("op_973")];
+            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_971_0, y = var_973)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [4, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [4, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [4, 1, 256]> var_948 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_948")];
-            tensor<int32, [3]> var_950_perm_0 = const()[name = tensor<string, []>("op_950_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_950 = transpose(perm = var_950_perm_0, x = var_948)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 4, 256]> input_151 = add(x = x_21, y = var_950)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 4, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 4, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_973 = const()[name = tensor<string, []>("op_973"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_974 = mul(x = input_159, y = var_973)[name = tensor<string, []>("op_974")];
-            tensor<fp32, [1, 4, 256]> input_161 = add(x = var_974, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [4, 1, 256]> var_1004 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1004")];
+            tensor<int32, [3]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = var_1004)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 4, 256]> input_153 = add(x = x_21, y = var_1006)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 4, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 4, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1029 = const()[name = tensor<string, []>("op_1029"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_1030 = mul(x = input_161, y = var_1029)[name = tensor<string, []>("op_1030")];
+            tensor<fp32, [1, 4, 256]> input_163 = add(x = var_1030, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 4]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_71, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
-            tensor<int32, [3]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
-            tensor<bool, [3]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = cat)[name = tensor<string, []>("op_992")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 4, 1]> var_995 = reduce_l2_norm(axes = var_994, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_995")];
+            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1048_begin_0 = const()[name = tensor<string, []>("op_1048_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
+            tensor<int32, [3]> var_1048_end_0 = const()[name = tensor<string, []>("op_1048_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
+            tensor<bool, [3]> var_1048_end_mask_0 = const()[name = tensor<string, []>("op_1048_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1048_begin_0, end = var_1048_end_0, end_mask = var_1048_end_mask_0, x = cat)[name = tensor<string, []>("op_1048")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 4, 1]> var_1051 = reduce_l2_norm(axes = var_1050, keep_dims = var_65, x = input_165)[name = tensor<string, []>("op_1051")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_995)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_999_axis_0 = const()[name = tensor<string, []>("op_999_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_999_axis_0, values = (var_206, var_420, var_634, nkv_1))[name = tensor<string, []>("op_999")];
-            tensor<int32, []> var_1001_axis_0 = const()[name = tensor<string, []>("op_1001_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1001_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1001")];
-            tensor<int32, []> var_1003_axis_0 = const()[name = tensor<string, []>("op_1003_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1003_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1003")];
-            tensor<fp32, []> var_1012 = const()[name = tensor<string, []>("op_1012"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1017 = const()[name = tensor<string, []>("op_1017"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1020 = const()[name = tensor<string, []>("op_1020"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1026 = const()[name = tensor<string, []>("op_1026"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1032 = const()[name = tensor<string, []>("op_1032"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_79, beta = const_12, x = var_1051)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1055_axis_0 = const()[name = tensor<string, []>("op_1055_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1055_axis_0, values = (var_262, var_476, var_690, nkv_1))[name = tensor<string, []>("op_1055")];
+            tensor<int32, []> var_1057_axis_0 = const()[name = tensor<string, []>("op_1057_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1057_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1057")];
+            tensor<int32, []> var_1059_axis_0 = const()[name = tensor<string, []>("op_1059_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1059_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1059")];
             tensor<fp32, [1, 4, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 4, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395584)))];
-            tensor<int32, [1]> var_1094_axes_0 = const()[name = tensor<string, []>("op_1094_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 4, 1, 256]> var_1094 = expand_dims(axes = var_1094_axes_0, x = emb)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 4, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 4, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1094)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 12, 512]> input_165 = concat(axis = var_1026, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 4, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1102_perm_0 = const()[name = tensor<string, []>("op_1102_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1102 = transpose(perm = var_1102_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 4, 256]> x_29 = reshape(shape = var_1106, x = var_1102)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 4, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 12, 512]> input_167 = concat(axis = var_72, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 4, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 4, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -983,132 +998,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 4, 256]> var_1114 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1115 = const()[name = tensor<string, []>("op_1115"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1116 = reshape(shape = var_1115, x = var_1114)[name = tensor<string, []>("op_1116")];
+            tensor<fp32, [12, 4, 256]> var_1147 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1120 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1121 = const()[name = tensor<string, []>("op_1121"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 4, 256]> var_1122 = mul(x = var_1120, y = var_1121)[name = tensor<string, []>("op_1122")];
-            tensor<int32, [4]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1124 = reshape(shape = var_1123, x = var_1122)[name = tensor<string, []>("op_1124")];
+            tensor<fp32, [12, 4, 256]> var_1153 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 4, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1128 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1129 = const()[name = tensor<string, []>("op_1129"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1130 = reshape(shape = var_1129, x = var_1128)[name = tensor<string, []>("op_1130")];
+            tensor<fp32, [12, 4, 256]> var_1161 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 4, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_1032, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_69, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [4]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_1 = clip(alpha = var_1022, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [4]> clip_1 = clip(alpha = var_59, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [4]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1124)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1116)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 4, 4]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [2]>([1, 4])];
-            tensor<fp32, [1, 4]> var_1143 = reshape(shape = var_1142, x = valid_mask)[name = tensor<string, []>("op_1143")];
-            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1143)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1146 = reshape(shape = var_1145, x = sqrt_s_t_9)[name = tensor<string, []>("op_1146")];
-            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1146)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 4, 4]> var_1148 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1148")];
+            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 4])];
+            tensor<fp32, [1, 4]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
+            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
+            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 4, 4]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1130)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1148, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1150_transpose_x_0 = const()[name = tensor<string, []>("op_1150_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1150_transpose_y_0 = const()[name = tensor<string, []>("op_1150_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> var_1150 = matmul(transpose_x = var_1150_transpose_x_0, transpose_y = var_1150_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1150")];
-            tensor<fp32, [4]> var_1151 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1151")];
-            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1153 = reshape(shape = var_1152, x = var_1151)[name = tensor<string, []>("op_1153")];
-            tensor<fp32, [12, 4, 4, 64]> cross_9 = mul(x = var_1150, y = var_1153)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
+            tensor<fp32, [4]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
+            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
+            tensor<fp32, [12, 4, 4, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 4, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1157 = reshape(shape = var_1156, x = valid_mask)[name = tensor<string, []>("op_1157")];
-            tensor<fp32, [12, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1157)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1159 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1159")];
-            tensor<bool, []> var_1161_transpose_x_1 = const()[name = tensor<string, []>("op_1161_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1161_transpose_y_1 = const()[name = tensor<string, []>("op_1161_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1161 = matmul(transpose_x = var_1161_transpose_x_1, transpose_y = var_1161_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1161")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1159, y = var_1161)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1163_keep_dims_0 = const()[name = tensor<string, []>("op_1163_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1163 = reduce_sum(keep_dims = var_1163_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1163")];
-            tensor<int32, [1]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1165 = reshape(shape = var_1164, x = var_1163)[name = tensor<string, []>("op_1165")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1165)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
+            tensor<fp32, [12, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
+            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
+            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1022, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_59, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1169 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1169")];
-            tensor<int32, [4]> var_1170_perm_0 = const()[name = tensor<string, []>("op_1170_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
+            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 4, 64]> var_1170 = transpose(perm = var_1170_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1019, x = var_1170)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1174 = const()[name = tensor<string, []>("op_1174"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> out_29 = reshape(shape = var_1174, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 4, 256]> var_1176 = silu(x = input_169)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [12, 4, 256]> input_171 = mul(x = var_1176, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 4, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 4, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 4, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_74, x = var_1203)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 4, 256]> var_1209 = silu(x = input_171)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [12, 4, 256]> input_173 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 4, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 4, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1017, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1186 = const()[name = tensor<string, []>("op_1186"), val = tensor<int32, [4]>([1, 12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1187 = reshape(shape = var_1186, x = xt_1)[name = tensor<string, []>("op_1187")];
-            tensor<int32, [4]> var_1188_perm_0 = const()[name = tensor<string, []>("op_1188_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> var_1188 = transpose(perm = var_1188_perm_0, x = var_1187)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [4, 12, 256]> query_1 = reshape(shape = var_1191, x = var_1188)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_66, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
+            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [4, 12, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 4, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 4, 768]> var_1214 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 4, 768]> var_1247 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 4, 3, 256])];
-            tensor<fp32, [12, 4, 3, 256]> var_1216 = reshape(shape = concat_1, x = var_1214)[name = tensor<string, []>("op_1216")];
-            tensor<int32, [1]> var_1217_axes_0 = const()[name = tensor<string, []>("op_1217_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 4, 3, 256]> var_1217 = expand_dims(axes = var_1217_axes_0, x = var_1216)[name = tensor<string, []>("op_1217")];
-            tensor<int32, [5]> var_1218_perm_0 = const()[name = tensor<string, []>("op_1218_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1219_axes_0 = const()[name = tensor<string, []>("op_1219_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 4, 1, 256]> var_1218 = transpose(perm = var_1218_perm_0, x = var_1217)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 4, 256]> var_1219 = squeeze(axes = var_1219_axes_0, x = var_1218)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [12, 4, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
+            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 4, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
+            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 4, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 4, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 4, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 4, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 4, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1227 = const()[name = tensor<string, []>("op_1227"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1228 = reshape(shape = var_1227, x = q_11)[name = tensor<string, []>("op_1228")];
+            tensor<fp32, [12, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1234 = const()[name = tensor<string, []>("op_1234"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1235 = reshape(shape = var_1234, x = k_11)[name = tensor<string, []>("op_1235")];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1241 = const()[name = tensor<string, []>("op_1241"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1242 = reshape(shape = var_1241, x = v_11)[name = tensor<string, []>("op_1242")];
+            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1228)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [4, 4, 12, 64]> q_15 = reshape(shape = var_1245, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1247 = const()[name = tensor<string, []>("op_1247"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1235)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [4, 4, 12, 64]> k_15 = reshape(shape = var_1247, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1242)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [4, 4, 12, 64]> v_15 = reshape(shape = var_1249, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [4, 4, 12, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [4, 4, 12, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [4, 4, 12, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1119,30 +1134,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1257 = const()[name = tensor<string, []>("op_1257"), val = tensor<int32, [2]>([48, 256])];
-            tensor<fp32, [12, 4, 4, 64]> var_1253 = transpose(perm = var_1252, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [48, 256]> attn_output_3 = reshape(shape = var_1257, x = var_1253)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [48, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1261 = const()[name = tensor<string, []>("op_1261"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> attn_output_7 = reshape(shape = var_1261, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([48, 256])];
+            tensor<fp32, [12, 4, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [48, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [48, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [4, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1017, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [4, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [4, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [4, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [4, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_66, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [4, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [4, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [4, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1017, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1281 = const()[name = tensor<string, []>("op_1281"), val = tensor<int32, [4]>([1, 4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> x_31 = reshape(shape = var_1281, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1283_perm_0 = const()[name = tensor<string, []>("op_1283_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1283 = transpose(perm = var_1283_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 4, 256]> x = reshape(shape = var_1287, x = var_1283)[name = tensor<string, []>("x")];
+            tensor<fp32, [4, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_66, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 4, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1153,120 +1168,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 4, 256]> var_1295 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1296 = const()[name = tensor<string, []>("op_1296"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1297 = reshape(shape = var_1296, x = var_1295)[name = tensor<string, []>("op_1297")];
+            tensor<fp32, [12, 4, 256]> var_1328 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1301 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1302 = const()[name = tensor<string, []>("op_1302"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 4, 256]> var_1303 = mul(x = var_1301, y = var_1302)[name = tensor<string, []>("op_1303")];
-            tensor<int32, [4]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1305 = reshape(shape = var_1304, x = var_1303)[name = tensor<string, []>("op_1305")];
+            tensor<fp32, [12, 4, 256]> var_1334 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 4, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1309 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1311 = reshape(shape = var_1310, x = var_1309)[name = tensor<string, []>("op_1311")];
+            tensor<fp32, [12, 4, 256]> var_1342 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 4, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [4]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_3 = clip(alpha = var_1022, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [4]> clip_3 = clip(alpha = var_59, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [4]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1305)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1297)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 4, 4]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1327 = reshape(shape = var_1326, x = sqrt_s_t)[name = tensor<string, []>("op_1327")];
-            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1327)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 4, 4]> var_1329 = mul(x = qk, y = M)[name = tensor<string, []>("op_1329")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1311)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 4, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1329, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1331_transpose_x_0 = const()[name = tensor<string, []>("op_1331_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1331_transpose_y_0 = const()[name = tensor<string, []>("op_1331_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> var_1331 = matmul(transpose_x = var_1331_transpose_x_0, transpose_y = var_1331_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1331")];
-            tensor<fp32, [4]> var_1332 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1332")];
-            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1334 = reshape(shape = var_1333, x = var_1332)[name = tensor<string, []>("op_1334")];
-            tensor<fp32, [12, 4, 4, 64]> cross = mul(x = var_1331, y = var_1334)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 4, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1157)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1340 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1340")];
-            tensor<bool, []> var_1342_transpose_x_1 = const()[name = tensor<string, []>("op_1342_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1342_transpose_y_1 = const()[name = tensor<string, []>("op_1342_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1342 = matmul(transpose_x = var_1342_transpose_x_1, transpose_y = var_1342_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1342")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1340, y = var_1342)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1165)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
+            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 4, 4]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 4, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
+            tensor<fp32, [4]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
+            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
+            tensor<fp32, [12, 4, 4, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 4, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
+            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1022, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_59, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1351_perm_0 = const()[name = tensor<string, []>("op_1351_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 4, 64]> var_1351 = transpose(perm = var_1351_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1019, x = var_1351)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1355 = const()[name = tensor<string, []>("op_1355"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> out = reshape(shape = var_1355, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 4, 256]> var_1357 = silu(x = input_187)[name = tensor<string, []>("op_1357")];
-            tensor<fp32, [12, 4, 256]> input_189 = mul(x = var_1357, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 4, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 4, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 4, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_74, x = var_1384)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 4, 256]> var_1390 = silu(x = input_189)[name = tensor<string, []>("op_1390")];
+            tensor<fp32, [12, 4, 256]> input_191 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 4, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 4, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1017, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1367 = const()[name = tensor<string, []>("op_1367"), val = tensor<int32, [4]>([1, 12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1368 = reshape(shape = var_1367, x = xt_5)[name = tensor<string, []>("op_1368")];
-            tensor<int32, [4]> var_1369_perm_0 = const()[name = tensor<string, []>("op_1369_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [3]>([4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> var_1369 = transpose(perm = var_1369_perm_0, x = var_1368)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [4, 12, 256]> query_5 = reshape(shape = var_1372, x = var_1369)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_66, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
+            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [4, 12, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 4, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 4, 768]> var_1395 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 4, 768]> var_1428 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 4, 3, 256])];
-            tensor<fp32, [12, 4, 3, 256]> var_1397 = reshape(shape = concat_2, x = var_1395)[name = tensor<string, []>("op_1397")];
-            tensor<int32, [1]> var_1398_axes_0 = const()[name = tensor<string, []>("op_1398_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 4, 3, 256]> var_1398 = expand_dims(axes = var_1398_axes_0, x = var_1397)[name = tensor<string, []>("op_1398")];
-            tensor<int32, [5]> var_1399_perm_0 = const()[name = tensor<string, []>("op_1399_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1400_axes_0 = const()[name = tensor<string, []>("op_1400_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 4, 1, 256]> var_1399 = transpose(perm = var_1399_perm_0, x = var_1398)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 4, 256]> var_1400 = squeeze(axes = var_1400_axes_0, x = var_1399)[name = tensor<string, []>("op_1400")];
+            tensor<fp32, [12, 4, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
+            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 4, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
+            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 4, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 4, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 4, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 4, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 4, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1409 = reshape(shape = var_1408, x = q_19)[name = tensor<string, []>("op_1409")];
+            tensor<fp32, [12, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1415 = const()[name = tensor<string, []>("op_1415"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1416 = reshape(shape = var_1415, x = k_19)[name = tensor<string, []>("op_1416")];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1422 = const()[name = tensor<string, []>("op_1422"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1423 = reshape(shape = var_1422, x = v_19)[name = tensor<string, []>("op_1423")];
+            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1426 = const()[name = tensor<string, []>("op_1426"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1409)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [4, 4, 12, 64]> q = reshape(shape = var_1426, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1428 = const()[name = tensor<string, []>("op_1428"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1416)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [4, 4, 12, 64]> k = reshape(shape = var_1428, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1423)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [4, 4, 12, 64]> v = reshape(shape = var_1430, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [4, 4, 12, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [4, 4, 12, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [4, 4, 12, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1277,36 +1292,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1438 = const()[name = tensor<string, []>("op_1438"), val = tensor<int32, [2]>([48, 256])];
-            tensor<fp32, [12, 4, 4, 64]> var_1434 = transpose(perm = var_1433, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [48, 256]> attn_output_11 = reshape(shape = var_1438, x = var_1434)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [48, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1442 = const()[name = tensor<string, []>("op_1442"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> attn_output = reshape(shape = var_1442, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([48, 256])];
+            tensor<fp32, [12, 4, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [48, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [48, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [4, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1017, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [4, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [4, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [4, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [4, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_66, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [4, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [4, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [4, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1017, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1462 = const()[name = tensor<string, []>("op_1462"), val = tensor<int32, [4]>([1, 4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> input = reshape(shape = var_1462, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1464 = const()[name = tensor<string, []>("op_1464"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 12, 1]> var_1465 = reduce_l2_norm(axes = var_1464, keep_dims = var_1020, x = input)[name = tensor<string, []>("op_1465")];
+            tensor<fp32, [4, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_66, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 12, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_65, x = input)[name = tensor<string, []>("op_1498")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 12, 1]> clip_5 = clip(alpha = var_1012, beta = const_42, x = var_1465)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 4, 12, 256]> var_1467 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1467")];
+            tensor<fp32, [1, 4, 12, 1]> clip_5 = clip(alpha = var_79, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 4, 12, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([4, 1, 256])];
             tensor<fp32, [4, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([4, 256, 12])];
-            tensor<fp32, [1, 4, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1467)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 4, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [4, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1317,10 +1332,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 4, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 4, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 4, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1471")];
-            tensor<int32, []> var_1473_axis_0 = const()[name = tensor<string, []>("op_1473_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1473_axis_0, values = (var_1169, nkv))[name = tensor<string, []>("op_1473")];
-            tensor<int32, []> var_1475_axis_0 = const()[name = tensor<string, []>("op_1475_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1475_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1475")];
+            tensor<fp32, [1, 4, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
+            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
+            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 8c1ca861fb9558eb088d1aeb653c17845f7087b0..ce2691fc8e6729b80850a421f04c1ba265fe63e3 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:512460ea937729458d31e1520a4866d8d4204db686d08c065f181bcfa9f13c79
-size 191051
+oid sha256:8eca0c1cf1c295154af76c01fc161e535e1e1cea55de94bee0645f33ba0761c8
+size 197123
diff --git a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Manifest.json b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Manifest.json
index 1f2e5792778d89ff02951ec657bbd30348216ac5..7be2773e83f16979af8b07eeeb27521e182fb490 100644
--- a/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Manifest.json
+++ b/optimized/dih2/400ms/ls_eend_dih2_400ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "8AAD4EFD-A1AE-4676-AFDF-014CF1A0D1F3": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "F3FC4090-5689-4260-9E54-5F399C74FF28": {
+        "817CC62A-ED39-420D-B527-1DBFEF63AF42": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "945611BB-429A-4DA9-B76F-AE8F78395321": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "8AAD4EFD-A1AE-4676-AFDF-014CF1A0D1F3"
+    "rootModelIdentifier": "945611BB-429A-4DA9-B76F-AE8F78395321"
 }
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/analytics/coremldata.bin b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/analytics/coremldata.bin
index 71b915acdd9abe5171728dadc38af3b7263e0247..f9423badbe72fa7ddbb043591f6219c56acdc2d5 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f3ab2696bac4217bb0bfc9f74776b079447b45424191c9523d5d87f9a001c761
+oid sha256:4c27ede5b72b63110ea75bc15260db0ccfac78e1baae1d7690710f20d3cccce0
 size 243
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/coremldata.bin b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/coremldata.bin
index f1fe815e519cef7d8e571c8c3bb4fc3c51bf16bd..d89904087ef63365cfe5fa100c8bd5edc7c0c119 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/coremldata.bin
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8aa47228de8673fd3089ab1dcf43fd425952e0eeaf3be2efeb7db0bdae626a6
-size 1308
+oid sha256:bee4de6a09887fa76315702a01b13f6d86ea41194abc0846c004f7052395b97d
+size 1411
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/metadata.json b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/metadata.json
index 118719eec347c4d50bae0ae8b401d770b4bf1ac8..b2071cb30a9295712fc171bbed673c20ff18db8a 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/metadata.json
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=5, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD II streaming diarizer (pipeline, T=5, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 72,
+      "Ios17.sliceByIndex" : 77,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 26,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 5 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 55 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 5, 345]",
+        "shape" : "[1, 55, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih2\", \"model_label\": \"DIHARD II\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 55}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/model.mil b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/model.mil
index d06d8cb5807c4c0b54c3e07f38ba5655c925422a..ab59896474305668d95443aae5b48ae6a2df960c 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/model.mil
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlmodelc/model.mil
@@ -1,234 +1,260 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 5, 345]> features, tensor<fp32, [5]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [5, 5]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [5]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
-            tensor<fp32, [5, 5]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_18 = const()[name = tensor<string, []>("op_18"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_21 = const()[name = tensor<string, []>("op_21"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_24 = const()[name = tensor<string, []>("op_24"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_27 = const()[name = tensor<string, []>("op_27"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 5, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_29, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 55, 23]> features, tensor<fp32, [5]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [5, 5]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [5]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
+            tensor<fp32, [5, 5]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 45, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, [3]> var_59_begin_0 = const()[name = tensor<string, []>("op_59_begin_0"), val = tensor<int32, [3]>([0, 40, 0])];
+            tensor<int32, [3]> var_59_end_0 = const()[name = tensor<string, []>("op_59_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_59_end_mask_0 = const()[name = tensor<string, []>("op_59_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_59 = slice_by_index(begin = var_59_begin_0, end = var_59_end_0, end_mask = var_59_end_mask_0, x = features)[name = tensor<string, []>("op_59")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49, var_59))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, [3]>([1, 5, 345])];
+            tensor<fp32, [1, 5, 345]> input_1 = reshape(shape = var_66, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_75 = const()[name = tensor<string, []>("op_75"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_81 = const()[name = tensor<string, []>("op_81"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_84 = const()[name = tensor<string, []>("op_84"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_90 = const()[name = tensor<string, []>("op_90"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_93 = const()[name = tensor<string, []>("op_93"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 5, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 5, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_148 = const()[name = tensor<string, []>("op_148"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_149 = mul(x = input_11, y = var_148)[name = tensor<string, []>("op_149")];
-            tensor<fp32, [1, 5, 256]> input_13 = add(x = var_149, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_76, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 5, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 5, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_215 = mul(x = input_13, y = var_214)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 5, 256]> input_15 = add(x = var_215, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_29, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,183 +265,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 5, 256]> var_163 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_164 = const()[name = tensor<string, []>("op_164"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_165 = reshape(shape = var_164, x = var_163)[name = tensor<string, []>("op_165")];
+            tensor<fp32, [1, 5, 256]> var_229 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_230 = const()[name = tensor<string, []>("op_230"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_231 = reshape(shape = var_230, x = var_229)[name = tensor<string, []>("op_231")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_169 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_170 = const()[name = tensor<string, []>("op_170"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_171 = mul(x = var_169, y = var_170)[name = tensor<string, []>("op_171")];
-            tensor<int32, [4]> var_172 = const()[name = tensor<string, []>("op_172"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_173 = reshape(shape = var_172, x = var_171)[name = tensor<string, []>("op_173")];
+            tensor<fp32, [1, 5, 256]> var_235 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_236 = const()[name = tensor<string, []>("op_236"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_237 = mul(x = var_235, y = var_236)[name = tensor<string, []>("op_237")];
+            tensor<int32, [4]> var_238 = const()[name = tensor<string, []>("op_238"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_239 = reshape(shape = var_238, x = var_237)[name = tensor<string, []>("op_239")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_177 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_179 = reshape(shape = var_178, x = var_177)[name = tensor<string, []>("op_179")];
+            tensor<fp32, [1, 5, 256]> var_243 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_245 = reshape(shape = var_244, x = var_243)[name = tensor<string, []>("op_245")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 5, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [5]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_173)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_165)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_239)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_231)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 5, 5]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_190 = reshape(shape = var_189, x = sqrt_s_t_1)[name = tensor<string, []>("op_190")];
-            tensor<fp32, [5, 5]> M_1 = real_div(x = encoder__causal_mask, y = var_190)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 5, 5]> var_192 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_192")];
+            tensor<int32, [2]> var_255 = const()[name = tensor<string, []>("op_255"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_256 = reshape(shape = var_255, x = sqrt_s_t_1)[name = tensor<string, []>("op_256")];
+            tensor<fp32, [5, 5]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_256)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 5, 5]> var_258 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_258")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_179)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_192, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_194_transpose_x_0 = const()[name = tensor<string, []>("op_194_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_194_transpose_y_0 = const()[name = tensor<string, []>("op_194_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_194 = matmul(transpose_x = var_194_transpose_x_0, transpose_y = var_194_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_194")];
-            tensor<fp32, [5]> var_195 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_195")];
-            tensor<int32, [4]> var_196 = const()[name = tensor<string, []>("op_196"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_197 = reshape(shape = var_196, x = var_195)[name = tensor<string, []>("op_197")];
-            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_194, y = var_197)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_245)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_258, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_260_transpose_x_0 = const()[name = tensor<string, []>("op_260_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_260_transpose_y_0 = const()[name = tensor<string, []>("op_260_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_260 = matmul(transpose_x = var_260_transpose_x_0, transpose_y = var_260_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_260")];
+            tensor<fp32, [5]> var_261 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_261")];
+            tensor<int32, [4]> var_262 = const()[name = tensor<string, []>("op_262"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_263 = reshape(shape = var_262, x = var_261)[name = tensor<string, []>("op_263")];
+            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_260, y = var_263)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 5, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_200 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_200")];
-            tensor<bool, []> var_202_transpose_x_1 = const()[name = tensor<string, []>("op_202_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_202_transpose_y_1 = const()[name = tensor<string, []>("op_202_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_202 = matmul(transpose_x = var_202_transpose_x_1, transpose_y = var_202_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_202")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_200, y = var_202)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_204 = const()[name = tensor<string, []>("op_204"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_204)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_206 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_206")];
-            tensor<fp32, [1, 4, 64, 64]> var_207 = real_div(x = new_kv_unnorm_1, y = var_206)[name = tensor<string, []>("op_207")];
-            tensor<int32, [4]> var_208_perm_0 = const()[name = tensor<string, []>("op_208_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_266 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_266")];
+            tensor<bool, []> var_268_transpose_x_1 = const()[name = tensor<string, []>("op_268_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_268_transpose_y_1 = const()[name = tensor<string, []>("op_268_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_268 = matmul(transpose_x = var_268_transpose_x_1, transpose_y = var_268_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_268")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_266, y = var_268)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_270 = const()[name = tensor<string, []>("op_270"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_270)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_272 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_272")];
+            tensor<fp32, [1, 4, 64, 64]> var_273 = real_div(x = new_kv_unnorm_1, y = var_272)[name = tensor<string, []>("op_273")];
+            tensor<int32, [4]> var_274_perm_0 = const()[name = tensor<string, []>("op_274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_208 = transpose(perm = var_208_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_18, x = var_208)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_212 = const()[name = tensor<string, []>("op_212"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_212, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 5, 256]> var_214 = silu(x = input_17)[name = tensor<string, []>("op_214")];
-            tensor<fp32, [1, 5, 256]> input_19 = mul(x = var_214, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 5, 4, 64]> var_274 = transpose(perm = var_274_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_84, x = var_274)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_278 = const()[name = tensor<string, []>("op_278"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_278, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 5, 256]> var_280 = silu(x = input_19)[name = tensor<string, []>("op_280")];
+            tensor<fp32, [1, 5, 256]> input_21 = mul(x = var_280, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_222_begin_0 = const()[name = tensor<string, []>("op_222_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_222_end_0 = const()[name = tensor<string, []>("op_222_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_222_end_mask_0 = const()[name = tensor<string, []>("op_222_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_222 = slice_by_index(begin = var_222_begin_0, end = var_222_end_0, end_mask = var_222_end_mask_0, x = x_3)[name = tensor<string, []>("op_222")];
-            tensor<int32, [3]> var_225_begin_0 = const()[name = tensor<string, []>("op_225_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_225_end_0 = const()[name = tensor<string, []>("op_225_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_225_end_mask_0 = const()[name = tensor<string, []>("op_225_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_225 = slice_by_index(begin = var_225_begin_0, end = var_225_end_0, end_mask = var_225_end_mask_0, x = window_1)[name = tensor<string, []>("op_225")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = x_3)[name = tensor<string, []>("op_288")];
+            tensor<int32, [3]> var_291_begin_0 = const()[name = tensor<string, []>("op_291_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_291_end_0 = const()[name = tensor<string, []>("op_291_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_291_end_mask_0 = const()[name = tensor<string, []>("op_291_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_291 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = window_1)[name = tensor<string, []>("op_291")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_27, interleave = window_3_interleave_0, values = (var_225, var_222))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_230_begin_0 = const()[name = tensor<string, []>("op_230_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_230_end_0 = const()[name = tensor<string, []>("op_230_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_230_end_mask_0 = const()[name = tensor<string, []>("op_230_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_230 = slice_by_index(begin = var_230_begin_0, end = var_230_end_0, end_mask = var_230_end_mask_0, x = x_3)[name = tensor<string, []>("op_230")];
-            tensor<int32, [3]> var_233_begin_0 = const()[name = tensor<string, []>("op_233_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_233_end_0 = const()[name = tensor<string, []>("op_233_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_233_end_mask_0 = const()[name = tensor<string, []>("op_233_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_233 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = window_3)[name = tensor<string, []>("op_233")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_93, interleave = window_3_interleave_0, values = (var_291, var_288))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = x_3)[name = tensor<string, []>("op_296")];
+            tensor<int32, [3]> var_299_begin_0 = const()[name = tensor<string, []>("op_299_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_299_end_0 = const()[name = tensor<string, []>("op_299_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_299_end_mask_0 = const()[name = tensor<string, []>("op_299_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_299 = slice_by_index(begin = var_299_begin_0, end = var_299_end_0, end_mask = var_299_end_mask_0, x = window_3)[name = tensor<string, []>("op_299")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_27, interleave = window_5_interleave_0, values = (var_233, var_230))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_238_begin_0 = const()[name = tensor<string, []>("op_238_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_238_end_0 = const()[name = tensor<string, []>("op_238_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_238_end_mask_0 = const()[name = tensor<string, []>("op_238_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_238 = slice_by_index(begin = var_238_begin_0, end = var_238_end_0, end_mask = var_238_end_mask_0, x = x_3)[name = tensor<string, []>("op_238")];
-            tensor<int32, [3]> var_241_begin_0 = const()[name = tensor<string, []>("op_241_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_241_end_0 = const()[name = tensor<string, []>("op_241_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_241_end_mask_0 = const()[name = tensor<string, []>("op_241_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_241 = slice_by_index(begin = var_241_begin_0, end = var_241_end_0, end_mask = var_241_end_mask_0, x = window_5)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_93, interleave = window_5_interleave_0, values = (var_299, var_296))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = x_3)[name = tensor<string, []>("op_304")];
+            tensor<int32, [3]> var_307_begin_0 = const()[name = tensor<string, []>("op_307_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_307_end_0 = const()[name = tensor<string, []>("op_307_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_307_end_mask_0 = const()[name = tensor<string, []>("op_307_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_307 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = window_5)[name = tensor<string, []>("op_307")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_27, interleave = window_7_interleave_0, values = (var_241, var_238))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_246_begin_0 = const()[name = tensor<string, []>("op_246_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_246_end_0 = const()[name = tensor<string, []>("op_246_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_246_end_mask_0 = const()[name = tensor<string, []>("op_246_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_246 = slice_by_index(begin = var_246_begin_0, end = var_246_end_0, end_mask = var_246_end_mask_0, x = x_3)[name = tensor<string, []>("op_246")];
-            tensor<int32, [3]> var_249_begin_0 = const()[name = tensor<string, []>("op_249_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_249_end_0 = const()[name = tensor<string, []>("op_249_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_249_end_mask_0 = const()[name = tensor<string, []>("op_249_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_249 = slice_by_index(begin = var_249_begin_0, end = var_249_end_0, end_mask = var_249_end_mask_0, x = window_7)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_93, interleave = window_7_interleave_0, values = (var_307, var_304))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_312_begin_0 = const()[name = tensor<string, []>("op_312_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_312_end_0 = const()[name = tensor<string, []>("op_312_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_312_end_mask_0 = const()[name = tensor<string, []>("op_312_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_312 = slice_by_index(begin = var_312_begin_0, end = var_312_end_0, end_mask = var_312_end_mask_0, x = x_3)[name = tensor<string, []>("op_312")];
+            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = window_7)[name = tensor<string, []>("op_315")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_27, interleave = window_9_interleave_0, values = (var_249, var_246))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_254_begin_0 = const()[name = tensor<string, []>("op_254_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_254_end_0 = const()[name = tensor<string, []>("op_254_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_254_end_mask_0 = const()[name = tensor<string, []>("op_254_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_254 = slice_by_index(begin = var_254_begin_0, end = var_254_end_0, end_mask = var_254_end_mask_0, x = x_3)[name = tensor<string, []>("op_254")];
-            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = window_9)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_93, interleave = window_9_interleave_0, values = (var_315, var_312))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_320_begin_0 = const()[name = tensor<string, []>("op_320_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_320_end_0 = const()[name = tensor<string, []>("op_320_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_320_end_mask_0 = const()[name = tensor<string, []>("op_320_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_320 = slice_by_index(begin = var_320_begin_0, end = var_320_end_0, end_mask = var_320_end_mask_0, x = x_3)[name = tensor<string, []>("op_320")];
+            tensor<int32, [3]> var_323_begin_0 = const()[name = tensor<string, []>("op_323_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_323_end_0 = const()[name = tensor<string, []>("op_323_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_323_end_mask_0 = const()[name = tensor<string, []>("op_323_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_323 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = window_9)[name = tensor<string, []>("op_323")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_27, interleave = window_11_interleave_0, values = (var_257, var_254))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_21 = concat(axis = var_24, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_93, interleave = window_11_interleave_0, values = (var_323, var_320))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_23 = concat(axis = var_79, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_282_split_sizes_0 = const()[name = tensor<string, []>("op_282_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_282_axis_0 = const()[name = tensor<string, []>("op_282_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_282_0, tensor<fp32, [5, 256, 16]> var_282_1 = split(axis = var_282_axis_0, split_sizes = var_282_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_282")];
-            tensor<fp32, [5, 256, 16]> var_284 = sigmoid(x = var_282_1)[name = tensor<string, []>("op_284")];
-            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_282_0, y = var_284)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [5, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_348_split_sizes_0 = const()[name = tensor<string, []>("op_348_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_348_axis_0 = const()[name = tensor<string, []>("op_348_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_348_0, tensor<fp32, [5, 256, 16]> var_348_1 = split(axis = var_348_axis_0, split_sizes = var_348_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_348")];
+            tensor<fp32, [5, 256, 16]> var_350 = sigmoid(x = var_348_1)[name = tensor<string, []>("op_350")];
+            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_348_0, y = var_350)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [5, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [5, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [5, 1, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_315")];
-            tensor<int32, [3]> var_317_perm_0 = const()[name = tensor<string, []>("op_317_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_317 = transpose(perm = var_317_perm_0, x = var_315)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 5, 256]> input_31 = add(x = x_3, y = var_317)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 5, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 5, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_340 = const()[name = tensor<string, []>("op_340"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_341 = mul(x = input_39, y = var_340)[name = tensor<string, []>("op_341")];
-            tensor<fp32, [1, 5, 256]> input_41 = add(x = var_341, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_29, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_381_begin_0 = const()[name = tensor<string, []>("op_381_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_381_end_0 = const()[name = tensor<string, []>("op_381_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_381_end_mask_0 = const()[name = tensor<string, []>("op_381_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [5, 1, 256]> var_381 = slice_by_index(begin = var_381_begin_0, end = var_381_end_0, end_mask = var_381_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_381")];
+            tensor<int32, [3]> var_383_perm_0 = const()[name = tensor<string, []>("op_383_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_383 = transpose(perm = var_383_perm_0, x = var_381)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 5, 256]> input_33 = add(x = x_3, y = var_383)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 5, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 5, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_406 = const()[name = tensor<string, []>("op_406"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_407 = mul(x = input_41, y = var_406)[name = tensor<string, []>("op_407")];
+            tensor<fp32, [1, 5, 256]> input_43 = add(x = var_407, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 5, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 5, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_370 = const()[name = tensor<string, []>("op_370"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_371 = mul(x = input_51, y = var_370)[name = tensor<string, []>("op_371")];
-            tensor<fp32, [1, 5, 256]> input_53 = add(x = var_371, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 5, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 5, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_436 = const()[name = tensor<string, []>("op_436"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_437 = mul(x = input_53, y = var_436)[name = tensor<string, []>("op_437")];
+            tensor<fp32, [1, 5, 256]> input_55 = add(x = var_437, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_29, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -426,183 +452,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 5, 256]> var_385 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_387 = reshape(shape = var_386, x = var_385)[name = tensor<string, []>("op_387")];
+            tensor<fp32, [1, 5, 256]> var_451 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_452 = const()[name = tensor<string, []>("op_452"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_453 = reshape(shape = var_452, x = var_451)[name = tensor<string, []>("op_453")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_391 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_392 = const()[name = tensor<string, []>("op_392"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_393 = mul(x = var_391, y = var_392)[name = tensor<string, []>("op_393")];
-            tensor<int32, [4]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_395 = reshape(shape = var_394, x = var_393)[name = tensor<string, []>("op_395")];
+            tensor<fp32, [1, 5, 256]> var_457 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_459 = mul(x = var_457, y = var_458)[name = tensor<string, []>("op_459")];
+            tensor<int32, [4]> var_460 = const()[name = tensor<string, []>("op_460"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_461 = reshape(shape = var_460, x = var_459)[name = tensor<string, []>("op_461")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_399 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_400 = const()[name = tensor<string, []>("op_400"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_401 = reshape(shape = var_400, x = var_399)[name = tensor<string, []>("op_401")];
+            tensor<fp32, [1, 5, 256]> var_465 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_466 = const()[name = tensor<string, []>("op_466"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_467 = reshape(shape = var_466, x = var_465)[name = tensor<string, []>("op_467")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 5, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [5]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_395)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_387)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_461)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_453)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 5, 5]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_412 = reshape(shape = var_411, x = sqrt_s_t_3)[name = tensor<string, []>("op_412")];
-            tensor<fp32, [5, 5]> M_3 = real_div(x = encoder__causal_mask, y = var_412)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 5, 5]> var_414 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_414")];
+            tensor<int32, [2]> var_477 = const()[name = tensor<string, []>("op_477"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_478 = reshape(shape = var_477, x = sqrt_s_t_3)[name = tensor<string, []>("op_478")];
+            tensor<fp32, [5, 5]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_478)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 5, 5]> var_480 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_480")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_401)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_414, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_416_transpose_x_0 = const()[name = tensor<string, []>("op_416_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_416_transpose_y_0 = const()[name = tensor<string, []>("op_416_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_416 = matmul(transpose_x = var_416_transpose_x_0, transpose_y = var_416_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_416")];
-            tensor<fp32, [5]> var_417 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_417")];
-            tensor<int32, [4]> var_418 = const()[name = tensor<string, []>("op_418"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_419 = reshape(shape = var_418, x = var_417)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_416, y = var_419)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_467)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_480, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_482_transpose_x_0 = const()[name = tensor<string, []>("op_482_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_482_transpose_y_0 = const()[name = tensor<string, []>("op_482_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_482 = matmul(transpose_x = var_482_transpose_x_0, transpose_y = var_482_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_482")];
+            tensor<fp32, [5]> var_483 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_483")];
+            tensor<int32, [4]> var_484 = const()[name = tensor<string, []>("op_484"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_485 = reshape(shape = var_484, x = var_483)[name = tensor<string, []>("op_485")];
+            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_482, y = var_485)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 5, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_422 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_422")];
-            tensor<bool, []> var_424_transpose_x_1 = const()[name = tensor<string, []>("op_424_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_424_transpose_y_1 = const()[name = tensor<string, []>("op_424_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_424 = matmul(transpose_x = var_424_transpose_x_1, transpose_y = var_424_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_424")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_422, y = var_424)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_426 = const()[name = tensor<string, []>("op_426"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_426)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_428 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_428")];
-            tensor<fp32, [1, 4, 64, 64]> var_429 = real_div(x = new_kv_unnorm_3, y = var_428)[name = tensor<string, []>("op_429")];
-            tensor<int32, [4]> var_430_perm_0 = const()[name = tensor<string, []>("op_430_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_488 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_488")];
+            tensor<bool, []> var_490_transpose_x_1 = const()[name = tensor<string, []>("op_490_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_490_transpose_y_1 = const()[name = tensor<string, []>("op_490_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_490 = matmul(transpose_x = var_490_transpose_x_1, transpose_y = var_490_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_490")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_488, y = var_490)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_492 = const()[name = tensor<string, []>("op_492"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_492)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_494 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_494")];
+            tensor<fp32, [1, 4, 64, 64]> var_495 = real_div(x = new_kv_unnorm_3, y = var_494)[name = tensor<string, []>("op_495")];
+            tensor<int32, [4]> var_496_perm_0 = const()[name = tensor<string, []>("op_496_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_430 = transpose(perm = var_430_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_18, x = var_430)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_434 = const()[name = tensor<string, []>("op_434"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_434, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 5, 256]> var_436 = silu(x = input_57)[name = tensor<string, []>("op_436")];
-            tensor<fp32, [1, 5, 256]> input_59 = mul(x = var_436, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 5, 4, 64]> var_496 = transpose(perm = var_496_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_84, x = var_496)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_500 = const()[name = tensor<string, []>("op_500"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_500, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 5, 256]> var_502 = silu(x = input_59)[name = tensor<string, []>("op_502")];
+            tensor<fp32, [1, 5, 256]> input_61 = mul(x = var_502, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_444_begin_0 = const()[name = tensor<string, []>("op_444_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_444_end_0 = const()[name = tensor<string, []>("op_444_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_444_end_mask_0 = const()[name = tensor<string, []>("op_444_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_444 = slice_by_index(begin = var_444_begin_0, end = var_444_end_0, end_mask = var_444_end_mask_0, x = x_9)[name = tensor<string, []>("op_444")];
-            tensor<int32, [3]> var_447_begin_0 = const()[name = tensor<string, []>("op_447_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_447_end_0 = const()[name = tensor<string, []>("op_447_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_447_end_mask_0 = const()[name = tensor<string, []>("op_447_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_447 = slice_by_index(begin = var_447_begin_0, end = var_447_end_0, end_mask = var_447_end_mask_0, x = window_13)[name = tensor<string, []>("op_447")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = x_9)[name = tensor<string, []>("op_510")];
+            tensor<int32, [3]> var_513_begin_0 = const()[name = tensor<string, []>("op_513_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_513_end_0 = const()[name = tensor<string, []>("op_513_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_513_end_mask_0 = const()[name = tensor<string, []>("op_513_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_513 = slice_by_index(begin = var_513_begin_0, end = var_513_end_0, end_mask = var_513_end_mask_0, x = window_13)[name = tensor<string, []>("op_513")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_27, interleave = window_15_interleave_0, values = (var_447, var_444))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_452_begin_0 = const()[name = tensor<string, []>("op_452_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_452_end_0 = const()[name = tensor<string, []>("op_452_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_452_end_mask_0 = const()[name = tensor<string, []>("op_452_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_452 = slice_by_index(begin = var_452_begin_0, end = var_452_end_0, end_mask = var_452_end_mask_0, x = x_9)[name = tensor<string, []>("op_452")];
-            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = window_15)[name = tensor<string, []>("op_455")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_93, interleave = window_15_interleave_0, values = (var_513, var_510))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = x_9)[name = tensor<string, []>("op_518")];
+            tensor<int32, [3]> var_521_begin_0 = const()[name = tensor<string, []>("op_521_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_521_end_0 = const()[name = tensor<string, []>("op_521_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_521_end_mask_0 = const()[name = tensor<string, []>("op_521_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_521 = slice_by_index(begin = var_521_begin_0, end = var_521_end_0, end_mask = var_521_end_mask_0, x = window_15)[name = tensor<string, []>("op_521")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_27, interleave = window_17_interleave_0, values = (var_455, var_452))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_460_begin_0 = const()[name = tensor<string, []>("op_460_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_460_end_0 = const()[name = tensor<string, []>("op_460_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_460_end_mask_0 = const()[name = tensor<string, []>("op_460_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_460 = slice_by_index(begin = var_460_begin_0, end = var_460_end_0, end_mask = var_460_end_mask_0, x = x_9)[name = tensor<string, []>("op_460")];
-            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = window_17)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_93, interleave = window_17_interleave_0, values = (var_521, var_518))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_526_begin_0 = const()[name = tensor<string, []>("op_526_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_526_end_0 = const()[name = tensor<string, []>("op_526_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_526_end_mask_0 = const()[name = tensor<string, []>("op_526_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_526 = slice_by_index(begin = var_526_begin_0, end = var_526_end_0, end_mask = var_526_end_mask_0, x = x_9)[name = tensor<string, []>("op_526")];
+            tensor<int32, [3]> var_529_begin_0 = const()[name = tensor<string, []>("op_529_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_529_end_0 = const()[name = tensor<string, []>("op_529_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_529_end_mask_0 = const()[name = tensor<string, []>("op_529_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_529 = slice_by_index(begin = var_529_begin_0, end = var_529_end_0, end_mask = var_529_end_mask_0, x = window_17)[name = tensor<string, []>("op_529")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_27, interleave = window_19_interleave_0, values = (var_463, var_460))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_468_begin_0 = const()[name = tensor<string, []>("op_468_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_468_end_0 = const()[name = tensor<string, []>("op_468_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_468_end_mask_0 = const()[name = tensor<string, []>("op_468_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_468 = slice_by_index(begin = var_468_begin_0, end = var_468_end_0, end_mask = var_468_end_mask_0, x = x_9)[name = tensor<string, []>("op_468")];
-            tensor<int32, [3]> var_471_begin_0 = const()[name = tensor<string, []>("op_471_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_471_end_0 = const()[name = tensor<string, []>("op_471_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_471_end_mask_0 = const()[name = tensor<string, []>("op_471_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_471 = slice_by_index(begin = var_471_begin_0, end = var_471_end_0, end_mask = var_471_end_mask_0, x = window_19)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_93, interleave = window_19_interleave_0, values = (var_529, var_526))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_534_begin_0 = const()[name = tensor<string, []>("op_534_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_534_end_0 = const()[name = tensor<string, []>("op_534_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_534_end_mask_0 = const()[name = tensor<string, []>("op_534_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_534 = slice_by_index(begin = var_534_begin_0, end = var_534_end_0, end_mask = var_534_end_mask_0, x = x_9)[name = tensor<string, []>("op_534")];
+            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = window_19)[name = tensor<string, []>("op_537")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_27, interleave = window_21_interleave_0, values = (var_471, var_468))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = x_9)[name = tensor<string, []>("op_476")];
-            tensor<int32, [3]> var_479_begin_0 = const()[name = tensor<string, []>("op_479_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_479_end_0 = const()[name = tensor<string, []>("op_479_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_479_end_mask_0 = const()[name = tensor<string, []>("op_479_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_479 = slice_by_index(begin = var_479_begin_0, end = var_479_end_0, end_mask = var_479_end_mask_0, x = window_21)[name = tensor<string, []>("op_479")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_93, interleave = window_21_interleave_0, values = (var_537, var_534))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_542_begin_0 = const()[name = tensor<string, []>("op_542_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_542_end_0 = const()[name = tensor<string, []>("op_542_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_542_end_mask_0 = const()[name = tensor<string, []>("op_542_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_542 = slice_by_index(begin = var_542_begin_0, end = var_542_end_0, end_mask = var_542_end_mask_0, x = x_9)[name = tensor<string, []>("op_542")];
+            tensor<int32, [3]> var_545_begin_0 = const()[name = tensor<string, []>("op_545_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_545_end_0 = const()[name = tensor<string, []>("op_545_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_545_end_mask_0 = const()[name = tensor<string, []>("op_545_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_545 = slice_by_index(begin = var_545_begin_0, end = var_545_end_0, end_mask = var_545_end_mask_0, x = window_21)[name = tensor<string, []>("op_545")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_27, interleave = window_23_interleave_0, values = (var_479, var_476))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_61 = concat(axis = var_24, interleave = input_61_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_93, interleave = window_23_interleave_0, values = (var_545, var_542))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_63 = concat(axis = var_79, interleave = input_63_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_504_split_sizes_0 = const()[name = tensor<string, []>("op_504_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_504_axis_0 = const()[name = tensor<string, []>("op_504_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_504_0, tensor<fp32, [5, 256, 16]> var_504_1 = split(axis = var_504_axis_0, split_sizes = var_504_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_504")];
-            tensor<fp32, [5, 256, 16]> var_506 = sigmoid(x = var_504_1)[name = tensor<string, []>("op_506")];
-            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_504_0, y = var_506)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [5, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_570_split_sizes_0 = const()[name = tensor<string, []>("op_570_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_570_axis_0 = const()[name = tensor<string, []>("op_570_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_570_0, tensor<fp32, [5, 256, 16]> var_570_1 = split(axis = var_570_axis_0, split_sizes = var_570_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_570")];
+            tensor<fp32, [5, 256, 16]> var_572 = sigmoid(x = var_570_1)[name = tensor<string, []>("op_572")];
+            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_570_0, y = var_572)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [5, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [5, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [5, 1, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_537")];
-            tensor<int32, [3]> var_539_perm_0 = const()[name = tensor<string, []>("op_539_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_539 = transpose(perm = var_539_perm_0, x = var_537)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 5, 256]> input_71 = add(x = x_9, y = var_539)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 5, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 5, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_562 = const()[name = tensor<string, []>("op_562"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_563 = mul(x = input_79, y = var_562)[name = tensor<string, []>("op_563")];
-            tensor<fp32, [1, 5, 256]> input_81 = add(x = var_563, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_29, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_603_begin_0 = const()[name = tensor<string, []>("op_603_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_603_end_0 = const()[name = tensor<string, []>("op_603_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_603_end_mask_0 = const()[name = tensor<string, []>("op_603_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [5, 1, 256]> var_603 = slice_by_index(begin = var_603_begin_0, end = var_603_end_0, end_mask = var_603_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_603")];
+            tensor<int32, [3]> var_605_perm_0 = const()[name = tensor<string, []>("op_605_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_605 = transpose(perm = var_605_perm_0, x = var_603)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 5, 256]> input_73 = add(x = x_9, y = var_605)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 5, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 5, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_628 = const()[name = tensor<string, []>("op_628"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_629 = mul(x = input_81, y = var_628)[name = tensor<string, []>("op_629")];
+            tensor<fp32, [1, 5, 256]> input_83 = add(x = var_629, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 5, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 5, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_592 = const()[name = tensor<string, []>("op_592"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_593 = mul(x = input_91, y = var_592)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 5, 256]> input_93 = add(x = var_593, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 5, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 5, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_658 = const()[name = tensor<string, []>("op_658"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_659 = mul(x = input_93, y = var_658)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 5, 256]> input_95 = add(x = var_659, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_29, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -613,183 +639,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 5, 256]> var_607 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_608 = const()[name = tensor<string, []>("op_608"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_609 = reshape(shape = var_608, x = var_607)[name = tensor<string, []>("op_609")];
+            tensor<fp32, [1, 5, 256]> var_673 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_675 = reshape(shape = var_674, x = var_673)[name = tensor<string, []>("op_675")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_613 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_614 = const()[name = tensor<string, []>("op_614"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_615 = mul(x = var_613, y = var_614)[name = tensor<string, []>("op_615")];
-            tensor<int32, [4]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_617 = reshape(shape = var_616, x = var_615)[name = tensor<string, []>("op_617")];
+            tensor<fp32, [1, 5, 256]> var_679 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_680 = const()[name = tensor<string, []>("op_680"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_681 = mul(x = var_679, y = var_680)[name = tensor<string, []>("op_681")];
+            tensor<int32, [4]> var_682 = const()[name = tensor<string, []>("op_682"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_683 = reshape(shape = var_682, x = var_681)[name = tensor<string, []>("op_683")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_621 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_622 = const()[name = tensor<string, []>("op_622"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_623 = reshape(shape = var_622, x = var_621)[name = tensor<string, []>("op_623")];
+            tensor<fp32, [1, 5, 256]> var_687 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_688 = const()[name = tensor<string, []>("op_688"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_689 = reshape(shape = var_688, x = var_687)[name = tensor<string, []>("op_689")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 5, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [5]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_617)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_609)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_683)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_675)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 5, 5]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_633 = const()[name = tensor<string, []>("op_633"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_634 = reshape(shape = var_633, x = sqrt_s_t_5)[name = tensor<string, []>("op_634")];
-            tensor<fp32, [5, 5]> M_5 = real_div(x = encoder__causal_mask, y = var_634)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 5, 5]> var_636 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_636")];
+            tensor<int32, [2]> var_699 = const()[name = tensor<string, []>("op_699"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_700 = reshape(shape = var_699, x = sqrt_s_t_5)[name = tensor<string, []>("op_700")];
+            tensor<fp32, [5, 5]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_700)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 5, 5]> var_702 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_702")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_623)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_636, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_638_transpose_x_0 = const()[name = tensor<string, []>("op_638_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_638_transpose_y_0 = const()[name = tensor<string, []>("op_638_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_638 = matmul(transpose_x = var_638_transpose_x_0, transpose_y = var_638_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_638")];
-            tensor<fp32, [5]> var_639 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_639")];
-            tensor<int32, [4]> var_640 = const()[name = tensor<string, []>("op_640"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_641 = reshape(shape = var_640, x = var_639)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_638, y = var_641)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_689)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_702, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_704_transpose_x_0 = const()[name = tensor<string, []>("op_704_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_704_transpose_y_0 = const()[name = tensor<string, []>("op_704_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_704 = matmul(transpose_x = var_704_transpose_x_0, transpose_y = var_704_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_704")];
+            tensor<fp32, [5]> var_705 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_705")];
+            tensor<int32, [4]> var_706 = const()[name = tensor<string, []>("op_706"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_707 = reshape(shape = var_706, x = var_705)[name = tensor<string, []>("op_707")];
+            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_704, y = var_707)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 5, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_644 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_644")];
-            tensor<bool, []> var_646_transpose_x_1 = const()[name = tensor<string, []>("op_646_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_646_transpose_y_1 = const()[name = tensor<string, []>("op_646_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_646 = matmul(transpose_x = var_646_transpose_x_1, transpose_y = var_646_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_646")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_644, y = var_646)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_648 = const()[name = tensor<string, []>("op_648"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_648)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_650 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_650")];
-            tensor<fp32, [1, 4, 64, 64]> var_651 = real_div(x = new_kv_unnorm_5, y = var_650)[name = tensor<string, []>("op_651")];
-            tensor<int32, [4]> var_652_perm_0 = const()[name = tensor<string, []>("op_652_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_710 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_710")];
+            tensor<bool, []> var_712_transpose_x_1 = const()[name = tensor<string, []>("op_712_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_712_transpose_y_1 = const()[name = tensor<string, []>("op_712_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_712 = matmul(transpose_x = var_712_transpose_x_1, transpose_y = var_712_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_712")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_710, y = var_712)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_714 = const()[name = tensor<string, []>("op_714"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_714)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_716 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_716")];
+            tensor<fp32, [1, 4, 64, 64]> var_717 = real_div(x = new_kv_unnorm_5, y = var_716)[name = tensor<string, []>("op_717")];
+            tensor<int32, [4]> var_718_perm_0 = const()[name = tensor<string, []>("op_718_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_652 = transpose(perm = var_652_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_18, x = var_652)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_656 = const()[name = tensor<string, []>("op_656"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_656, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 5, 256]> var_658 = silu(x = input_97)[name = tensor<string, []>("op_658")];
-            tensor<fp32, [1, 5, 256]> input_99 = mul(x = var_658, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 5, 4, 64]> var_718 = transpose(perm = var_718_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_84, x = var_718)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_722, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 5, 256]> var_724 = silu(x = input_99)[name = tensor<string, []>("op_724")];
+            tensor<fp32, [1, 5, 256]> input_101 = mul(x = var_724, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_666_begin_0 = const()[name = tensor<string, []>("op_666_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_666_end_0 = const()[name = tensor<string, []>("op_666_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_666_end_mask_0 = const()[name = tensor<string, []>("op_666_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_666 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = x_15)[name = tensor<string, []>("op_666")];
-            tensor<int32, [3]> var_669_begin_0 = const()[name = tensor<string, []>("op_669_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_669_end_0 = const()[name = tensor<string, []>("op_669_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_669_end_mask_0 = const()[name = tensor<string, []>("op_669_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_669 = slice_by_index(begin = var_669_begin_0, end = var_669_end_0, end_mask = var_669_end_mask_0, x = window_25)[name = tensor<string, []>("op_669")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = x_15)[name = tensor<string, []>("op_732")];
+            tensor<int32, [3]> var_735_begin_0 = const()[name = tensor<string, []>("op_735_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_735_end_0 = const()[name = tensor<string, []>("op_735_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_735_end_mask_0 = const()[name = tensor<string, []>("op_735_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_735 = slice_by_index(begin = var_735_begin_0, end = var_735_end_0, end_mask = var_735_end_mask_0, x = window_25)[name = tensor<string, []>("op_735")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_27, interleave = window_27_interleave_0, values = (var_669, var_666))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_674_begin_0 = const()[name = tensor<string, []>("op_674_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_674_end_0 = const()[name = tensor<string, []>("op_674_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_674_end_mask_0 = const()[name = tensor<string, []>("op_674_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_674 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = x_15)[name = tensor<string, []>("op_674")];
-            tensor<int32, [3]> var_677_begin_0 = const()[name = tensor<string, []>("op_677_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_677_end_0 = const()[name = tensor<string, []>("op_677_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_677_end_mask_0 = const()[name = tensor<string, []>("op_677_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_677 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = window_27)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_93, interleave = window_27_interleave_0, values = (var_735, var_732))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_740_begin_0 = const()[name = tensor<string, []>("op_740_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_740_end_0 = const()[name = tensor<string, []>("op_740_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_740_end_mask_0 = const()[name = tensor<string, []>("op_740_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_740 = slice_by_index(begin = var_740_begin_0, end = var_740_end_0, end_mask = var_740_end_mask_0, x = x_15)[name = tensor<string, []>("op_740")];
+            tensor<int32, [3]> var_743_begin_0 = const()[name = tensor<string, []>("op_743_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_743_end_0 = const()[name = tensor<string, []>("op_743_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_743_end_mask_0 = const()[name = tensor<string, []>("op_743_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_743 = slice_by_index(begin = var_743_begin_0, end = var_743_end_0, end_mask = var_743_end_mask_0, x = window_27)[name = tensor<string, []>("op_743")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_27, interleave = window_29_interleave_0, values = (var_677, var_674))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = x_15)[name = tensor<string, []>("op_682")];
-            tensor<int32, [3]> var_685_begin_0 = const()[name = tensor<string, []>("op_685_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_685_end_0 = const()[name = tensor<string, []>("op_685_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_685_end_mask_0 = const()[name = tensor<string, []>("op_685_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_685 = slice_by_index(begin = var_685_begin_0, end = var_685_end_0, end_mask = var_685_end_mask_0, x = window_29)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_93, interleave = window_29_interleave_0, values = (var_743, var_740))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_748_begin_0 = const()[name = tensor<string, []>("op_748_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_748_end_0 = const()[name = tensor<string, []>("op_748_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_748_end_mask_0 = const()[name = tensor<string, []>("op_748_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_748 = slice_by_index(begin = var_748_begin_0, end = var_748_end_0, end_mask = var_748_end_mask_0, x = x_15)[name = tensor<string, []>("op_748")];
+            tensor<int32, [3]> var_751_begin_0 = const()[name = tensor<string, []>("op_751_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_751_end_0 = const()[name = tensor<string, []>("op_751_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_751_end_mask_0 = const()[name = tensor<string, []>("op_751_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_751 = slice_by_index(begin = var_751_begin_0, end = var_751_end_0, end_mask = var_751_end_mask_0, x = window_29)[name = tensor<string, []>("op_751")];
             tensor<bool, []> window_31_interleave_0 = const()[name = tensor<string, []>("window_31_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_27, interleave = window_31_interleave_0, values = (var_685, var_682))[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = x_15)[name = tensor<string, []>("op_690")];
-            tensor<int32, [3]> var_693_begin_0 = const()[name = tensor<string, []>("op_693_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_693_end_0 = const()[name = tensor<string, []>("op_693_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_693_end_mask_0 = const()[name = tensor<string, []>("op_693_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_693 = slice_by_index(begin = var_693_begin_0, end = var_693_end_0, end_mask = var_693_end_mask_0, x = window_31)[name = tensor<string, []>("op_693")];
+            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_93, interleave = window_31_interleave_0, values = (var_751, var_748))[name = tensor<string, []>("window_31")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = x_15)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = window_31)[name = tensor<string, []>("op_759")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_27, interleave = window_33_interleave_0, values = (var_693, var_690))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = x_15)[name = tensor<string, []>("op_698")];
-            tensor<int32, [3]> var_701_begin_0 = const()[name = tensor<string, []>("op_701_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_701_end_0 = const()[name = tensor<string, []>("op_701_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_701_end_mask_0 = const()[name = tensor<string, []>("op_701_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_701 = slice_by_index(begin = var_701_begin_0, end = var_701_end_0, end_mask = var_701_end_mask_0, x = window_33)[name = tensor<string, []>("op_701")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_93, interleave = window_33_interleave_0, values = (var_759, var_756))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_764_begin_0 = const()[name = tensor<string, []>("op_764_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_764_end_0 = const()[name = tensor<string, []>("op_764_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_764_end_mask_0 = const()[name = tensor<string, []>("op_764_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_764 = slice_by_index(begin = var_764_begin_0, end = var_764_end_0, end_mask = var_764_end_mask_0, x = x_15)[name = tensor<string, []>("op_764")];
+            tensor<int32, [3]> var_767_begin_0 = const()[name = tensor<string, []>("op_767_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_767_end_0 = const()[name = tensor<string, []>("op_767_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_767_end_mask_0 = const()[name = tensor<string, []>("op_767_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_767 = slice_by_index(begin = var_767_begin_0, end = var_767_end_0, end_mask = var_767_end_mask_0, x = window_33)[name = tensor<string, []>("op_767")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_27, interleave = window_35_interleave_0, values = (var_701, var_698))[name = tensor<string, []>("window_35")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_101 = concat(axis = var_24, interleave = input_101_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_93, interleave = window_35_interleave_0, values = (var_767, var_764))[name = tensor<string, []>("window_35")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_103 = concat(axis = var_79, interleave = input_103_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_726_split_sizes_0 = const()[name = tensor<string, []>("op_726_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_726_axis_0 = const()[name = tensor<string, []>("op_726_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_726_0, tensor<fp32, [5, 256, 16]> var_726_1 = split(axis = var_726_axis_0, split_sizes = var_726_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_726")];
-            tensor<fp32, [5, 256, 16]> var_728 = sigmoid(x = var_726_1)[name = tensor<string, []>("op_728")];
-            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_726_0, y = var_728)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [5, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_792_split_sizes_0 = const()[name = tensor<string, []>("op_792_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_792_axis_0 = const()[name = tensor<string, []>("op_792_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_792_0, tensor<fp32, [5, 256, 16]> var_792_1 = split(axis = var_792_axis_0, split_sizes = var_792_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [5, 256, 16]> var_794 = sigmoid(x = var_792_1)[name = tensor<string, []>("op_794")];
+            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_792_0, y = var_794)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [5, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [5, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [5, 1, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_759")];
-            tensor<int32, [3]> var_761_perm_0 = const()[name = tensor<string, []>("op_761_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_761 = transpose(perm = var_761_perm_0, x = var_759)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 5, 256]> input_111 = add(x = x_15, y = var_761)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 5, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 5, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_785 = mul(x = input_119, y = var_784)[name = tensor<string, []>("op_785")];
-            tensor<fp32, [1, 5, 256]> input_121 = add(x = var_785, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_29, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_825_begin_0 = const()[name = tensor<string, []>("op_825_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_825_end_0 = const()[name = tensor<string, []>("op_825_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_825_end_mask_0 = const()[name = tensor<string, []>("op_825_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [5, 1, 256]> var_825 = slice_by_index(begin = var_825_begin_0, end = var_825_end_0, end_mask = var_825_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_825")];
+            tensor<int32, [3]> var_827_perm_0 = const()[name = tensor<string, []>("op_827_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_827 = transpose(perm = var_827_perm_0, x = var_825)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 5, 256]> input_113 = add(x = x_15, y = var_827)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 5, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 5, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_850 = const()[name = tensor<string, []>("op_850"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_851 = mul(x = input_121, y = var_850)[name = tensor<string, []>("op_851")];
+            tensor<fp32, [1, 5, 256]> input_123 = add(x = var_851, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 5, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 5, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_814 = const()[name = tensor<string, []>("op_814"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_815 = mul(x = input_131, y = var_814)[name = tensor<string, []>("op_815")];
-            tensor<fp32, [1, 5, 256]> input_133 = add(x = var_815, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 5, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 5, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_880 = const()[name = tensor<string, []>("op_880"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_881 = mul(x = input_133, y = var_880)[name = tensor<string, []>("op_881")];
+            tensor<fp32, [1, 5, 256]> input_135 = add(x = var_881, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_29, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -800,219 +826,212 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 5, 256]> var_829 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_831 = reshape(shape = var_830, x = var_829)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 5, 256]> var_895 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_896 = const()[name = tensor<string, []>("op_896"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_897 = reshape(shape = var_896, x = var_895)[name = tensor<string, []>("op_897")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_835 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_836 = const()[name = tensor<string, []>("op_836"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_837 = mul(x = var_835, y = var_836)[name = tensor<string, []>("op_837")];
-            tensor<int32, [4]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_839 = reshape(shape = var_838, x = var_837)[name = tensor<string, []>("op_839")];
+            tensor<fp32, [1, 5, 256]> var_901 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_902 = const()[name = tensor<string, []>("op_902"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_903 = mul(x = var_901, y = var_902)[name = tensor<string, []>("op_903")];
+            tensor<int32, [4]> var_904 = const()[name = tensor<string, []>("op_904"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_905 = reshape(shape = var_904, x = var_903)[name = tensor<string, []>("op_905")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_843 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_845 = reshape(shape = var_844, x = var_843)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 5, 256]> var_909 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_910 = const()[name = tensor<string, []>("op_910"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_911 = reshape(shape = var_910, x = var_909)[name = tensor<string, []>("op_911")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 5, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [5]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_839)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_831)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_905)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_897)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 5, 5]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_855 = const()[name = tensor<string, []>("op_855"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_856 = reshape(shape = var_855, x = sqrt_s_t_7)[name = tensor<string, []>("op_856")];
-            tensor<fp32, [5, 5]> M_7 = real_div(x = encoder__causal_mask, y = var_856)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 5, 5]> var_858 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [2]> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_922 = reshape(shape = var_921, x = sqrt_s_t_7)[name = tensor<string, []>("op_922")];
+            tensor<fp32, [5, 5]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_922)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 5, 5]> var_924 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_924")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_845)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_858, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_860_transpose_x_0 = const()[name = tensor<string, []>("op_860_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_860_transpose_y_0 = const()[name = tensor<string, []>("op_860_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_860 = matmul(transpose_x = var_860_transpose_x_0, transpose_y = var_860_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_860")];
-            tensor<fp32, [5]> var_861 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_861")];
-            tensor<int32, [4]> var_862 = const()[name = tensor<string, []>("op_862"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_863 = reshape(shape = var_862, x = var_861)[name = tensor<string, []>("op_863")];
-            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_860, y = var_863)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_911)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_924, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_926_transpose_x_0 = const()[name = tensor<string, []>("op_926_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_926_transpose_y_0 = const()[name = tensor<string, []>("op_926_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_926 = matmul(transpose_x = var_926_transpose_x_0, transpose_y = var_926_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_926")];
+            tensor<fp32, [5]> var_927 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_927")];
+            tensor<int32, [4]> var_928 = const()[name = tensor<string, []>("op_928"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_929 = reshape(shape = var_928, x = var_927)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_926, y = var_929)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 5, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_866 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_866")];
-            tensor<bool, []> var_868_transpose_x_1 = const()[name = tensor<string, []>("op_868_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_868_transpose_y_1 = const()[name = tensor<string, []>("op_868_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_868 = matmul(transpose_x = var_868_transpose_x_1, transpose_y = var_868_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_868")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_866, y = var_868)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_870 = const()[name = tensor<string, []>("op_870"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_870)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_872 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_872")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_872)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_874_perm_0 = const()[name = tensor<string, []>("op_874_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_932 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_932")];
+            tensor<bool, []> var_934_transpose_x_1 = const()[name = tensor<string, []>("op_934_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_934_transpose_y_1 = const()[name = tensor<string, []>("op_934_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_934 = matmul(transpose_x = var_934_transpose_x_1, transpose_y = var_934_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_934")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_932, y = var_934)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_936)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_938 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_938")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_938)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_940_perm_0 = const()[name = tensor<string, []>("op_940_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_874 = transpose(perm = var_874_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_18, x = var_874)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_878 = const()[name = tensor<string, []>("op_878"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_878, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 5, 256]> var_880 = silu(x = input_137)[name = tensor<string, []>("op_880")];
-            tensor<fp32, [1, 5, 256]> input_139 = mul(x = var_880, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 5, 4, 64]> var_940 = transpose(perm = var_940_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_84, x = var_940)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_944 = const()[name = tensor<string, []>("op_944"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_944, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 5, 256]> var_946 = silu(x = input_139)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 5, 256]> input_141 = mul(x = var_946, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_37_begin_0 = const()[name = tensor<string, []>("window_37_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_37_end_0 = const()[name = tensor<string, []>("window_37_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_37_end_mask_0 = const()[name = tensor<string, []>("window_37_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_37_squeeze_mask_0 = const()[name = tensor<string, []>("window_37_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_37 = slice_by_index(begin = window_37_begin_0, end = window_37_end_0, end_mask = window_37_end_mask_0, squeeze_mask = window_37_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = x_21)[name = tensor<string, []>("op_888")];
-            tensor<int32, [3]> var_891_begin_0 = const()[name = tensor<string, []>("op_891_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_891_end_0 = const()[name = tensor<string, []>("op_891_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_891_end_mask_0 = const()[name = tensor<string, []>("op_891_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_891 = slice_by_index(begin = var_891_begin_0, end = var_891_end_0, end_mask = var_891_end_mask_0, x = window_37)[name = tensor<string, []>("op_891")];
+            tensor<int32, [3]> var_954_begin_0 = const()[name = tensor<string, []>("op_954_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_954_end_0 = const()[name = tensor<string, []>("op_954_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_954_end_mask_0 = const()[name = tensor<string, []>("op_954_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_954 = slice_by_index(begin = var_954_begin_0, end = var_954_end_0, end_mask = var_954_end_mask_0, x = x_21)[name = tensor<string, []>("op_954")];
+            tensor<int32, [3]> var_957_begin_0 = const()[name = tensor<string, []>("op_957_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_957_end_0 = const()[name = tensor<string, []>("op_957_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_957_end_mask_0 = const()[name = tensor<string, []>("op_957_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_957 = slice_by_index(begin = var_957_begin_0, end = var_957_end_0, end_mask = var_957_end_mask_0, x = window_37)[name = tensor<string, []>("op_957")];
             tensor<bool, []> window_39_interleave_0 = const()[name = tensor<string, []>("window_39_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_27, interleave = window_39_interleave_0, values = (var_891, var_888))[name = tensor<string, []>("window_39")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = x_21)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> var_899_begin_0 = const()[name = tensor<string, []>("op_899_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_899_end_0 = const()[name = tensor<string, []>("op_899_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_899_end_mask_0 = const()[name = tensor<string, []>("op_899_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_899 = slice_by_index(begin = var_899_begin_0, end = var_899_end_0, end_mask = var_899_end_mask_0, x = window_39)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_93, interleave = window_39_interleave_0, values = (var_957, var_954))[name = tensor<string, []>("window_39")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = x_21)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_965_begin_0 = const()[name = tensor<string, []>("op_965_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_965_end_0 = const()[name = tensor<string, []>("op_965_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_965_end_mask_0 = const()[name = tensor<string, []>("op_965_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_965 = slice_by_index(begin = var_965_begin_0, end = var_965_end_0, end_mask = var_965_end_mask_0, x = window_39)[name = tensor<string, []>("op_965")];
             tensor<bool, []> window_41_interleave_0 = const()[name = tensor<string, []>("window_41_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_27, interleave = window_41_interleave_0, values = (var_899, var_896))[name = tensor<string, []>("window_41")];
-            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = x_21)[name = tensor<string, []>("op_904")];
-            tensor<int32, [3]> var_907_begin_0 = const()[name = tensor<string, []>("op_907_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_907_end_0 = const()[name = tensor<string, []>("op_907_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_907_end_mask_0 = const()[name = tensor<string, []>("op_907_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_907 = slice_by_index(begin = var_907_begin_0, end = var_907_end_0, end_mask = var_907_end_mask_0, x = window_41)[name = tensor<string, []>("op_907")];
+            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_93, interleave = window_41_interleave_0, values = (var_965, var_962))[name = tensor<string, []>("window_41")];
+            tensor<int32, [3]> var_970_begin_0 = const()[name = tensor<string, []>("op_970_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_970_end_0 = const()[name = tensor<string, []>("op_970_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_970_end_mask_0 = const()[name = tensor<string, []>("op_970_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_970 = slice_by_index(begin = var_970_begin_0, end = var_970_end_0, end_mask = var_970_end_mask_0, x = x_21)[name = tensor<string, []>("op_970")];
+            tensor<int32, [3]> var_973_begin_0 = const()[name = tensor<string, []>("op_973_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_973_end_0 = const()[name = tensor<string, []>("op_973_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_973_end_mask_0 = const()[name = tensor<string, []>("op_973_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_973 = slice_by_index(begin = var_973_begin_0, end = var_973_end_0, end_mask = var_973_end_mask_0, x = window_41)[name = tensor<string, []>("op_973")];
             tensor<bool, []> window_43_interleave_0 = const()[name = tensor<string, []>("window_43_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_27, interleave = window_43_interleave_0, values = (var_907, var_904))[name = tensor<string, []>("window_43")];
-            tensor<int32, [3]> var_912_begin_0 = const()[name = tensor<string, []>("op_912_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_912_end_0 = const()[name = tensor<string, []>("op_912_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_912_end_mask_0 = const()[name = tensor<string, []>("op_912_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_912 = slice_by_index(begin = var_912_begin_0, end = var_912_end_0, end_mask = var_912_end_mask_0, x = x_21)[name = tensor<string, []>("op_912")];
-            tensor<int32, [3]> var_915_begin_0 = const()[name = tensor<string, []>("op_915_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_915_end_0 = const()[name = tensor<string, []>("op_915_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_915_end_mask_0 = const()[name = tensor<string, []>("op_915_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_915 = slice_by_index(begin = var_915_begin_0, end = var_915_end_0, end_mask = var_915_end_mask_0, x = window_43)[name = tensor<string, []>("op_915")];
+            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_93, interleave = window_43_interleave_0, values = (var_973, var_970))[name = tensor<string, []>("window_43")];
+            tensor<int32, [3]> var_978_begin_0 = const()[name = tensor<string, []>("op_978_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_978_end_0 = const()[name = tensor<string, []>("op_978_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_978_end_mask_0 = const()[name = tensor<string, []>("op_978_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_978 = slice_by_index(begin = var_978_begin_0, end = var_978_end_0, end_mask = var_978_end_mask_0, x = x_21)[name = tensor<string, []>("op_978")];
+            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = window_43)[name = tensor<string, []>("op_981")];
             tensor<bool, []> window_45_interleave_0 = const()[name = tensor<string, []>("window_45_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_27, interleave = window_45_interleave_0, values = (var_915, var_912))[name = tensor<string, []>("window_45")];
-            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = x_21)[name = tensor<string, []>("op_920")];
-            tensor<int32, [3]> var_923_begin_0 = const()[name = tensor<string, []>("op_923_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_923_end_0 = const()[name = tensor<string, []>("op_923_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_923_end_mask_0 = const()[name = tensor<string, []>("op_923_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_923 = slice_by_index(begin = var_923_begin_0, end = var_923_end_0, end_mask = var_923_end_mask_0, x = window_45)[name = tensor<string, []>("op_923")];
+            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_93, interleave = window_45_interleave_0, values = (var_981, var_978))[name = tensor<string, []>("window_45")];
+            tensor<int32, [3]> var_986_begin_0 = const()[name = tensor<string, []>("op_986_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_986_end_0 = const()[name = tensor<string, []>("op_986_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_986_end_mask_0 = const()[name = tensor<string, []>("op_986_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_986 = slice_by_index(begin = var_986_begin_0, end = var_986_end_0, end_mask = var_986_end_mask_0, x = x_21)[name = tensor<string, []>("op_986")];
+            tensor<int32, [3]> var_989_begin_0 = const()[name = tensor<string, []>("op_989_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_989_end_0 = const()[name = tensor<string, []>("op_989_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_989_end_mask_0 = const()[name = tensor<string, []>("op_989_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_989 = slice_by_index(begin = var_989_begin_0, end = var_989_end_0, end_mask = var_989_end_mask_0, x = window_45)[name = tensor<string, []>("op_989")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_27, interleave = window_interleave_0, values = (var_923, var_920))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_141 = concat(axis = var_24, interleave = input_141_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_93, interleave = window_interleave_0, values = (var_989, var_986))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_143 = concat(axis = var_79, interleave = input_143_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_948_split_sizes_0 = const()[name = tensor<string, []>("op_948_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_948_axis_0 = const()[name = tensor<string, []>("op_948_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_948_0, tensor<fp32, [5, 256, 16]> var_948_1 = split(axis = var_948_axis_0, split_sizes = var_948_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_948")];
-            tensor<fp32, [5, 256, 16]> var_950 = sigmoid(x = var_948_1)[name = tensor<string, []>("op_950")];
-            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_948_0, y = var_950)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [5, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_1014_split_sizes_0 = const()[name = tensor<string, []>("op_1014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_1014_axis_0 = const()[name = tensor<string, []>("op_1014_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_1014_0, tensor<fp32, [5, 256, 16]> var_1014_1 = split(axis = var_1014_axis_0, split_sizes = var_1014_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_1014")];
+            tensor<fp32, [5, 256, 16]> var_1016 = sigmoid(x = var_1014_1)[name = tensor<string, []>("op_1016")];
+            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_1014_0, y = var_1016)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [5, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [5, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [5, 1, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_981")];
-            tensor<int32, [3]> var_983_perm_0 = const()[name = tensor<string, []>("op_983_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_983 = transpose(perm = var_983_perm_0, x = var_981)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 5, 256]> input_151 = add(x = x_21, y = var_983)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 5, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 5, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_1006 = const()[name = tensor<string, []>("op_1006"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_1007 = mul(x = input_159, y = var_1006)[name = tensor<string, []>("op_1007")];
-            tensor<fp32, [1, 5, 256]> input_161 = add(x = var_1007, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1047_begin_0 = const()[name = tensor<string, []>("op_1047_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1047_end_0 = const()[name = tensor<string, []>("op_1047_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_1047_end_mask_0 = const()[name = tensor<string, []>("op_1047_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [5, 1, 256]> var_1047 = slice_by_index(begin = var_1047_begin_0, end = var_1047_end_0, end_mask = var_1047_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1047")];
+            tensor<int32, [3]> var_1049_perm_0 = const()[name = tensor<string, []>("op_1049_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_1049 = transpose(perm = var_1049_perm_0, x = var_1047)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 5, 256]> input_153 = add(x = x_21, y = var_1049)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 5, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 5, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_1073 = mul(x = input_161, y = var_1072)[name = tensor<string, []>("op_1073")];
+            tensor<fp32, [1, 5, 256]> input_163 = add(x = var_1073, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_29, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 5]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_21, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_81, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_1025_begin_0 = const()[name = tensor<string, []>("op_1025_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
-            tensor<int32, [3]> var_1025_end_0 = const()[name = tensor<string, []>("op_1025_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
-            tensor<bool, [3]> var_1025_end_mask_0 = const()[name = tensor<string, []>("op_1025_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = cat)[name = tensor<string, []>("op_1025")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 5, 1]> var_1028 = reduce_l2_norm(axes = var_1027, keep_dims = var_30, x = input_163)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1091_begin_0 = const()[name = tensor<string, []>("op_1091_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
+            tensor<int32, [3]> var_1091_end_0 = const()[name = tensor<string, []>("op_1091_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
+            tensor<bool, [3]> var_1091_end_mask_0 = const()[name = tensor<string, []>("op_1091_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1091_begin_0, end = var_1091_end_0, end_mask = var_1091_end_mask_0, x = cat)[name = tensor<string, []>("op_1091")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 5, 1]> var_1094 = reduce_l2_norm(axes = var_1093, keep_dims = var_75, x = input_165)[name = tensor<string, []>("op_1094")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_1028)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_1032_axis_0 = const()[name = tensor<string, []>("op_1032_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1032_axis_0, values = (var_207, var_429, var_651, nkv_1))[name = tensor<string, []>("op_1032")];
-            tensor<int32, []> var_1034_axis_0 = const()[name = tensor<string, []>("op_1034_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1034_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1034")];
-            tensor<int32, []> var_1036_axis_0 = const()[name = tensor<string, []>("op_1036_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1036_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1036")];
-            tensor<fp32, []> var_1045 = const()[name = tensor<string, []>("op_1045"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1053 = const()[name = tensor<string, []>("op_1053"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_90, beta = const_12, x = var_1094)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1098_axis_0 = const()[name = tensor<string, []>("op_1098_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1098_axis_0, values = (var_273, var_495, var_717, nkv_1))[name = tensor<string, []>("op_1098")];
+            tensor<int32, []> var_1100_axis_0 = const()[name = tensor<string, []>("op_1100_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1100_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1100")];
+            tensor<int32, []> var_1102_axis_0 = const()[name = tensor<string, []>("op_1102_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1102_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1102")];
             tensor<fp32, [1, 5, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 5, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395712)))];
-            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 5, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
+            tensor<int32, [1]> var_1170_axes_0 = const()[name = tensor<string, []>("op_1170_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 5, 1, 256]> var_1170 = expand_dims(axes = var_1170_axes_0, x = emb)[name = tensor<string, []>("op_1170")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 5, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 5, 12, 512]> input_165 = concat(axis = var_1059, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 5, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 5, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 5, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1170)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 5, 12, 512]> input_167 = concat(axis = var_82, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 5, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1178_perm_0 = const()[name = tensor<string, []>("op_1178_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1178 = transpose(perm = var_1178_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 5, 256]> x_29 = reshape(shape = var_1182, x = var_1178)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1023,132 +1042,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 5, 256]> var_1147 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
+            tensor<fp32, [12, 5, 256]> var_1190 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1192 = reshape(shape = var_1191, x = var_1190)[name = tensor<string, []>("op_1192")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1153 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 5, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
+            tensor<fp32, [12, 5, 256]> var_1196 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 5, 256]> var_1198 = mul(x = var_1196, y = var_1197)[name = tensor<string, []>("op_1198")];
+            tensor<int32, [4]> var_1199 = const()[name = tensor<string, []>("op_1199"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1200 = reshape(shape = var_1199, x = var_1198)[name = tensor<string, []>("op_1200")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1161 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
+            tensor<fp32, [12, 5, 256]> var_1204 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1205 = const()[name = tensor<string, []>("op_1205"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1206 = reshape(shape = var_1205, x = var_1204)[name = tensor<string, []>("op_1206")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 5, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_1065, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_79, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [5]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_1 = clip(alpha = var_1055, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [5]> clip_1 = clip(alpha = var_69, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [5]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1200)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1192)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 5, 5]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 5])];
-            tensor<fp32, [1, 5]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
-            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 5, 5]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
+            tensor<int32, [2]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [2]>([1, 5])];
+            tensor<fp32, [1, 5]> var_1219 = reshape(shape = var_1218, x = valid_mask)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1219)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1221 = const()[name = tensor<string, []>("op_1221"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1222 = reshape(shape = var_1221, x = sqrt_s_t_9)[name = tensor<string, []>("op_1222")];
+            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1222)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 5, 5]> var_1224 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1224")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
-            tensor<fp32, [5]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
-            tensor<fp32, [12, 4, 5, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1206)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1224, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1226_transpose_x_0 = const()[name = tensor<string, []>("op_1226_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1226_transpose_y_0 = const()[name = tensor<string, []>("op_1226_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> var_1226 = matmul(transpose_x = var_1226_transpose_x_0, transpose_y = var_1226_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1226")];
+            tensor<fp32, [5]> var_1227 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1227")];
+            tensor<int32, [4]> var_1228 = const()[name = tensor<string, []>("op_1228"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1229 = reshape(shape = var_1228, x = var_1227)[name = tensor<string, []>("op_1229")];
+            tensor<fp32, [12, 4, 5, 64]> cross_9 = mul(x = var_1226, y = var_1229)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 5, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
-            tensor<fp32, [12, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
-            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
-            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1233 = reshape(shape = var_1232, x = valid_mask)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [12, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1233)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1235 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1235")];
+            tensor<bool, []> var_1237_transpose_x_1 = const()[name = tensor<string, []>("op_1237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1237_transpose_y_1 = const()[name = tensor<string, []>("op_1237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1237 = matmul(transpose_x = var_1237_transpose_x_1, transpose_y = var_1237_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1237")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1235, y = var_1237)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1239_keep_dims_0 = const()[name = tensor<string, []>("op_1239_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1239 = reduce_sum(keep_dims = var_1239_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [1]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1241)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1055, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_69, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
-            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1245 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1245")];
+            tensor<int32, [4]> var_1246_perm_0 = const()[name = tensor<string, []>("op_1246_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1052, x = var_1203)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 5, 256]> var_1209 = silu(x = input_169)[name = tensor<string, []>("op_1209")];
-            tensor<fp32, [12, 5, 256]> input_171 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 5, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 5, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 5, 4, 64]> var_1246 = transpose(perm = var_1246_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_84, x = var_1246)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1250 = const()[name = tensor<string, []>("op_1250"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> out_29 = reshape(shape = var_1250, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 5, 256]> var_1252 = silu(x = input_171)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [12, 5, 256]> input_173 = mul(x = var_1252, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 5, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 5, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1050, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
-            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [5, 12, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_76, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [4]>([1, 12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1263 = reshape(shape = var_1262, x = xt_1)[name = tensor<string, []>("op_1263")];
+            tensor<int32, [4]> var_1264_perm_0 = const()[name = tensor<string, []>("op_1264_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> var_1264 = transpose(perm = var_1264_perm_0, x = var_1263)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [5, 12, 256]> query_1 = reshape(shape = var_1267, x = var_1264)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 5, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 5, 768]> var_1247 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 5, 768]> var_1290 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 5, 3, 256])];
-            tensor<fp32, [12, 5, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
-            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 5, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
-            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 5, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 5, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [12, 5, 3, 256]> var_1292 = reshape(shape = concat_1, x = var_1290)[name = tensor<string, []>("op_1292")];
+            tensor<int32, [1]> var_1293_axes_0 = const()[name = tensor<string, []>("op_1293_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 5, 3, 256]> var_1293 = expand_dims(axes = var_1293_axes_0, x = var_1292)[name = tensor<string, []>("op_1293")];
+            tensor<int32, [5]> var_1294_perm_0 = const()[name = tensor<string, []>("op_1294_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1295_axes_0 = const()[name = tensor<string, []>("op_1295_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 5, 1, 256]> var_1294 = transpose(perm = var_1294_perm_0, x = var_1293)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 5, 256]> var_1295 = squeeze(axes = var_1295_axes_0, x = var_1294)[name = tensor<string, []>("op_1295")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 5, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 5, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 5, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
+            tensor<fp32, [12, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1304 = reshape(shape = var_1303, x = q_11)[name = tensor<string, []>("op_1304")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
+            tensor<int32, [3]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1311 = reshape(shape = var_1310, x = k_11)[name = tensor<string, []>("op_1311")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [3]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1318 = reshape(shape = var_1317, x = v_11)[name = tensor<string, []>("op_1318")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [5, 4, 12, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [5, 4, 12, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [5, 4, 12, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1304)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [5, 4, 12, 64]> q_15 = reshape(shape = var_1321, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1311)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [5, 4, 12, 64]> k_15 = reshape(shape = var_1323, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1325 = const()[name = tensor<string, []>("op_1325"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1318)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [5, 4, 12, 64]> v_15 = reshape(shape = var_1325, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1159,30 +1178,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([60, 256])];
-            tensor<fp32, [12, 5, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [60, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [60, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1328 = const()[name = tensor<string, []>("op_1328"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [2]>([60, 256])];
+            tensor<fp32, [12, 5, 4, 64]> var_1329 = transpose(perm = var_1328, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [60, 256]> attn_output_3 = reshape(shape = var_1333, x = var_1329)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [60, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> attn_output_7 = reshape(shape = var_1337, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [5, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1050, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [5, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [5, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [5, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [5, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_76, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [5, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [5, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [5, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1050, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 5, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
+            tensor<fp32, [5, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_76, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [4]>([1, 5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> x_31 = reshape(shape = var_1357, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1359_perm_0 = const()[name = tensor<string, []>("op_1359_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1359 = transpose(perm = var_1359_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 5, 256]> x = reshape(shape = var_1363, x = var_1359)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1193,120 +1212,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 5, 256]> var_1328 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
+            tensor<fp32, [12, 5, 256]> var_1371 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1373 = reshape(shape = var_1372, x = var_1371)[name = tensor<string, []>("op_1373")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1334 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 5, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
+            tensor<fp32, [12, 5, 256]> var_1377 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 5, 256]> var_1379 = mul(x = var_1377, y = var_1378)[name = tensor<string, []>("op_1379")];
+            tensor<int32, [4]> var_1380 = const()[name = tensor<string, []>("op_1380"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1381 = reshape(shape = var_1380, x = var_1379)[name = tensor<string, []>("op_1381")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1342 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
+            tensor<fp32, [12, 5, 256]> var_1385 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1386 = const()[name = tensor<string, []>("op_1386"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1387 = reshape(shape = var_1386, x = var_1385)[name = tensor<string, []>("op_1387")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 5, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [5]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_3 = clip(alpha = var_1055, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [5]> clip_3 = clip(alpha = var_69, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [5]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1381)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1373)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 5, 5]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
-            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 5, 5]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 5, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
-            tensor<fp32, [5]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
-            tensor<fp32, [12, 4, 5, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 5, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
-            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1402 = const()[name = tensor<string, []>("op_1402"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1403 = reshape(shape = var_1402, x = sqrt_s_t)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1403)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 5, 5]> var_1405 = mul(x = qk, y = M)[name = tensor<string, []>("op_1405")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1387)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 5, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1405, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1407_transpose_x_0 = const()[name = tensor<string, []>("op_1407_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1407_transpose_y_0 = const()[name = tensor<string, []>("op_1407_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> var_1407 = matmul(transpose_x = var_1407_transpose_x_0, transpose_y = var_1407_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1407")];
+            tensor<fp32, [5]> var_1408 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1408")];
+            tensor<int32, [4]> var_1409 = const()[name = tensor<string, []>("op_1409"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1410 = reshape(shape = var_1409, x = var_1408)[name = tensor<string, []>("op_1410")];
+            tensor<fp32, [12, 4, 5, 64]> cross = mul(x = var_1407, y = var_1410)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 5, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1233)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1416 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1416")];
+            tensor<bool, []> var_1418_transpose_x_1 = const()[name = tensor<string, []>("op_1418_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1418_transpose_y_1 = const()[name = tensor<string, []>("op_1418_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1418 = matmul(transpose_x = var_1418_transpose_x_1, transpose_y = var_1418_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1418")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1416, y = var_1418)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1241)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1055, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_69, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1427_perm_0 = const()[name = tensor<string, []>("op_1427_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1052, x = var_1384)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 5, 256]> var_1390 = silu(x = input_187)[name = tensor<string, []>("op_1390")];
-            tensor<fp32, [12, 5, 256]> input_189 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 5, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 5, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 5, 4, 64]> var_1427 = transpose(perm = var_1427_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_84, x = var_1427)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1431 = const()[name = tensor<string, []>("op_1431"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> out = reshape(shape = var_1431, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 5, 256]> var_1433 = silu(x = input_189)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [12, 5, 256]> input_191 = mul(x = var_1433, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 5, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 5, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1050, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
-            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [5, 12, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_76, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1443 = const()[name = tensor<string, []>("op_1443"), val = tensor<int32, [4]>([1, 12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1444 = reshape(shape = var_1443, x = xt_5)[name = tensor<string, []>("op_1444")];
+            tensor<int32, [4]> var_1445_perm_0 = const()[name = tensor<string, []>("op_1445_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> var_1445 = transpose(perm = var_1445_perm_0, x = var_1444)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [5, 12, 256]> query_5 = reshape(shape = var_1448, x = var_1445)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 5, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 5, 768]> var_1428 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 5, 768]> var_1471 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 5, 3, 256])];
-            tensor<fp32, [12, 5, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
-            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 5, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
-            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 5, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 5, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [12, 5, 3, 256]> var_1473 = reshape(shape = concat_2, x = var_1471)[name = tensor<string, []>("op_1473")];
+            tensor<int32, [1]> var_1474_axes_0 = const()[name = tensor<string, []>("op_1474_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 5, 3, 256]> var_1474 = expand_dims(axes = var_1474_axes_0, x = var_1473)[name = tensor<string, []>("op_1474")];
+            tensor<int32, [5]> var_1475_perm_0 = const()[name = tensor<string, []>("op_1475_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1476_axes_0 = const()[name = tensor<string, []>("op_1476_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 5, 1, 256]> var_1475 = transpose(perm = var_1475_perm_0, x = var_1474)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 5, 256]> var_1476 = squeeze(axes = var_1476_axes_0, x = var_1475)[name = tensor<string, []>("op_1476")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 5, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 5, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 5, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
+            tensor<fp32, [12, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1484 = const()[name = tensor<string, []>("op_1484"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1485 = reshape(shape = var_1484, x = q_19)[name = tensor<string, []>("op_1485")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
+            tensor<int32, [3]> var_1491 = const()[name = tensor<string, []>("op_1491"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1492 = reshape(shape = var_1491, x = k_19)[name = tensor<string, []>("op_1492")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
+            tensor<int32, [3]> var_1498 = const()[name = tensor<string, []>("op_1498"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1499 = reshape(shape = var_1498, x = v_19)[name = tensor<string, []>("op_1499")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [5, 4, 12, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [5, 4, 12, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [5, 4, 12, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1502 = const()[name = tensor<string, []>("op_1502"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1485)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [5, 4, 12, 64]> q = reshape(shape = var_1502, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1504 = const()[name = tensor<string, []>("op_1504"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1492)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [5, 4, 12, 64]> k = reshape(shape = var_1504, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1506 = const()[name = tensor<string, []>("op_1506"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1499)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [5, 4, 12, 64]> v = reshape(shape = var_1506, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1317,36 +1336,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([60, 256])];
-            tensor<fp32, [12, 5, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [60, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [60, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1509 = const()[name = tensor<string, []>("op_1509"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1514 = const()[name = tensor<string, []>("op_1514"), val = tensor<int32, [2]>([60, 256])];
+            tensor<fp32, [12, 5, 4, 64]> var_1510 = transpose(perm = var_1509, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [60, 256]> attn_output_11 = reshape(shape = var_1514, x = var_1510)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [60, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1518 = const()[name = tensor<string, []>("op_1518"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> attn_output = reshape(shape = var_1518, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [5, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1050, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [5, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [5, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [5, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [5, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_76, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [5, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [5, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [5, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1050, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 12, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_1053, x = input)[name = tensor<string, []>("op_1498")];
+            tensor<fp32, [5, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_76, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1538 = const()[name = tensor<string, []>("op_1538"), val = tensor<int32, [4]>([1, 5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> input = reshape(shape = var_1538, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1540 = const()[name = tensor<string, []>("op_1540"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 12, 1]> var_1541 = reduce_l2_norm(axes = var_1540, keep_dims = var_75, x = input)[name = tensor<string, []>("op_1541")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 12, 1]> clip_5 = clip(alpha = var_1045, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 5, 12, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
+            tensor<fp32, [1, 5, 12, 1]> clip_5 = clip(alpha = var_90, beta = const_42, x = var_1541)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 5, 12, 256]> var_1543 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1543")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([5, 1, 256])];
             tensor<fp32, [5, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([5, 256, 12])];
-            tensor<fp32, [1, 5, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 5, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1543)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [5, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1357,10 +1376,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 5, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 5, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 5, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
-            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
-            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
+            tensor<fp32, [1, 5, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1547")];
+            tensor<int32, []> var_1549_axis_0 = const()[name = tensor<string, []>("op_1549_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1549_axis_0, values = (var_1245, nkv))[name = tensor<string, []>("op_1549")];
+            tensor<int32, []> var_1551_axis_0 = const()[name = tensor<string, []>("op_1551_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1551_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1551")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index ee2f5685e43d7adb38c9c1a4e1b73168d29549bc..8d7849f201b9a56c345f9d633052694a7179a893 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c600d793bc84760911328536382b994a486396bf5f73c044403146818f2ba2f1
-size 196627
+oid sha256:a2c020978aa574f4e94c5c4eb17591bdfd5a7c47b821971904d55db85631b0dc
+size 203227
diff --git a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Manifest.json b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Manifest.json
index 9b05f7f5a11617791e1ce810a13372eb75f0afe2..be0e3204e5fd671e217172b8c6e0f13f9de50722 100644
--- a/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Manifest.json
+++ b/optimized/dih2/500ms/ls_eend_dih2_500ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "4275B100-4E00-4F6A-9426-B138B856397B": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "DBD9F036-EA3D-4076-9E45-DF3A8F67CEED": {
+        "7AB5BB1E-EBD5-49AC-8966-C82191C767B6": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "F7EB0522-7F8B-4B11-955A-E1CF6D14C11C": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "4275B100-4E00-4F6A-9426-B138B856397B"
+    "rootModelIdentifier": "F7EB0522-7F8B-4B11-955A-E1CF6D14C11C"
 }
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/analytics/coremldata.bin b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/analytics/coremldata.bin
index 009646e23619a556a10590ffca6efa31fb7f6eeb..86bc9045c3e712049d0ae18400c2da6a70347117 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:23998be748af73a6dec0a0dced0634a6a32bf25edbeb9b6f24938033cbb5bcbf
+oid sha256:eae41bbb03511ff0e04bb217b278b49e67a9744724c28ac2c3b1ecbb6a719544
 size 243
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/coremldata.bin b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/coremldata.bin
index c569cfb25eae428b97226bb2c29697a5083561fb..a0a5315c87eb589f6b299c4b786fe45a45a95486 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/coremldata.bin
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:868aa7ddffd7f41be3a3dc214b15d2e85acbbceb685c24d452f5c4629170b995
-size 1310
+oid sha256:ca07a132288cc2acbaee7e034d18c3a9d66bd5be22617b52e9e812d160268ccf
+size 1413
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/metadata.json b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/metadata.json
index 74e418759f83a521fd3e5c06595da4efc184dc64..55ea7f0524227f02cd802073192368a90c729142 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/metadata.json
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=1, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=1, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,12 +81,12 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 66,
+      "Ios17.reshape" : 67,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
       "Split" : 4,
-      "Ios17.expandDims" : 3,
+      "Ios17.expandDims" : 4,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
       "Ios17.sliceByIndex" : 36,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 1 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 15 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 1, 345]",
+        "shape" : "[1, 15, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 1, \"step_duration_ms\": 100, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 15}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/model.mil b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/model.mil
index 182f88767b6a7156fba8d5e17cc71543704b1b0a..4bc2f36465c8a6b77c6ec5021d7ddf40dec1643b 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/model.mil
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlmodelc/model.mil
@@ -1,233 +1,239 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 1, 345]> features, tensor<fp32, [1]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [1, 1]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
-            tensor<fp32, [1]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 1, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 15, 23]> features, tensor<fp32, [1]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [1, 1]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [1, 1]>([[0x1p+0]])];
+            tensor<fp32, [1]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [1]>([0x1p+0])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [1]> stacked_axes_0 = const()[name = tensor<string, []>("stacked_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1, 1, 15, 23]> stacked = expand_dims(axes = stacked_axes_0, x = features)[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, [3]>([1, 1, 345])];
+            tensor<fp32, [1, 1, 345]> input_1 = reshape(shape = var_26, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_35 = const()[name = tensor<string, []>("op_35"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_42 = const()[name = tensor<string, []>("op_42"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 1, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 1, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 1, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 1, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 1, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_36, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 1, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 1, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 1, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_173 = const()[name = tensor<string, []>("op_173"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_174 = mul(x = input_13, y = var_173)[name = tensor<string, []>("op_174")];
+            tensor<fp32, [1, 1, 256]> input_15 = add(x = var_174, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 1, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -238,139 +244,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 1, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 1, 256]> var_188 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_190 = reshape(shape = var_189, x = var_188)[name = tensor<string, []>("op_190")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 1, 256]> var_194 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_196 = mul(x = var_194, y = var_195)[name = tensor<string, []>("op_196")];
+            tensor<int32, [4]> var_197 = const()[name = tensor<string, []>("op_197"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_198 = reshape(shape = var_197, x = var_196)[name = tensor<string, []>("op_198")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 1, 256]> var_202 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_204 = reshape(shape = var_203, x = var_202)[name = tensor<string, []>("op_204")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 1, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [1]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [1]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 1, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_198)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 1, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_190)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 1, 1]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [1, 1]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 1, 1]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_215 = reshape(shape = var_214, x = sqrt_s_t_1)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 1]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_215)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 1, 1]> var_217 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_217")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [1]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 1, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_204)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 1, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_217, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_219_transpose_x_0 = const()[name = tensor<string, []>("op_219_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_219_transpose_y_0 = const()[name = tensor<string, []>("op_219_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_219 = matmul(transpose_x = var_219_transpose_x_0, transpose_y = var_219_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_219")];
+            tensor<fp32, [1]> var_220 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_220")];
+            tensor<int32, [4]> var_221 = const()[name = tensor<string, []>("op_221"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_222 = reshape(shape = var_221, x = var_220)[name = tensor<string, []>("op_222")];
+            tensor<fp32, [1, 4, 1, 64]> cross_1 = mul(x = var_219, y = var_222)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 1, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_225 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_225")];
+            tensor<bool, []> var_227_transpose_x_1 = const()[name = tensor<string, []>("op_227_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_227_transpose_y_1 = const()[name = tensor<string, []>("op_227_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_227 = matmul(transpose_x = var_227_transpose_x_1, transpose_y = var_227_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_227")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_225, y = var_227)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_229)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_231 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_231")];
+            tensor<fp32, [1, 4, 64, 64]> var_232 = real_div(x = new_kv_unnorm_1, y = var_231)[name = tensor<string, []>("op_232")];
+            tensor<int32, [4]> var_233_perm_0 = const()[name = tensor<string, []>("op_233_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 1, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 1, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 1, 4, 64]> var_233 = transpose(perm = var_233_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 1, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_44, x = var_233)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_5 = reshape(shape = var_237, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 1, 256]> var_239 = silu(x = input_19)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [1, 1, 256]> input_21 = mul(x = var_239, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 1, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 1, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_250_begin_0 = const()[name = tensor<string, []>("op_250_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_250_end_0 = const()[name = tensor<string, []>("op_250_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_250_end_mask_0 = const()[name = tensor<string, []>("op_250_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_250 = slice_by_index(begin = var_250_begin_0, end = var_250_end_0, end_mask = var_250_end_mask_0, x = window_1)[name = tensor<string, []>("op_250")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, x_3))[name = tensor<string, []>("window_3")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = window_3)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_52, interleave = window_3_interleave_0, values = (var_250, x_3))[name = tensor<string, []>("window_3")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_23 = concat(axis = var_39, interleave = input_23_interleave_0, values = window_3)[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_249_split_sizes_0 = const()[name = tensor<string, []>("op_249_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_249_axis_0 = const()[name = tensor<string, []>("op_249_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_249_0, tensor<fp32, [1, 256, 16]> var_249_1 = split(axis = var_249_axis_0, split_sizes = var_249_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_249")];
-            tensor<fp32, [1, 256, 16]> var_251 = sigmoid(x = var_249_1)[name = tensor<string, []>("op_251")];
-            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_249_0, y = var_251)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [1, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [1, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_275_split_sizes_0 = const()[name = tensor<string, []>("op_275_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_275_axis_0 = const()[name = tensor<string, []>("op_275_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_275_0, tensor<fp32, [1, 256, 16]> var_275_1 = split(axis = var_275_axis_0, split_sizes = var_275_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_275")];
+            tensor<fp32, [1, 256, 16]> var_277 = sigmoid(x = var_275_1)[name = tensor<string, []>("op_277")];
+            tensor<fp32, [1, 256, 16]> inputs_5 = mul(x = var_275_0, y = var_277)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [1, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [1, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [1, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_282_begin_0 = const()[name = tensor<string, []>("op_282_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_282_end_0 = const()[name = tensor<string, []>("op_282_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_282_end_mask_0 = const()[name = tensor<string, []>("op_282_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [1, 1, 256]> var_282 = slice_by_index(begin = var_282_begin_0, end = var_282_end_0, end_mask = var_282_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_282")];
-            tensor<int32, [3]> var_284_perm_0 = const()[name = tensor<string, []>("op_284_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_284 = transpose(perm = var_284_perm_0, x = var_282)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 1, 256]> input_31 = add(x = x_3, y = var_284)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 1, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 1, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_307 = const()[name = tensor<string, []>("op_307"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_308 = mul(x = input_39, y = var_307)[name = tensor<string, []>("op_308")];
-            tensor<fp32, [1, 1, 256]> input_41 = add(x = var_308, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_308_begin_0 = const()[name = tensor<string, []>("op_308_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_308_end_0 = const()[name = tensor<string, []>("op_308_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_308_end_mask_0 = const()[name = tensor<string, []>("op_308_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [1, 1, 256]> var_308 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_308")];
+            tensor<int32, [3]> var_310_perm_0 = const()[name = tensor<string, []>("op_310_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_310 = transpose(perm = var_310_perm_0, x = var_308)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 1, 256]> input_33 = add(x = x_3, y = var_310)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 1, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 1, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 1, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_333 = const()[name = tensor<string, []>("op_333"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_334 = mul(x = input_41, y = var_333)[name = tensor<string, []>("op_334")];
+            tensor<fp32, [1, 1, 256]> input_43 = add(x = var_334, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 1, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 1, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_337 = const()[name = tensor<string, []>("op_337"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_338 = mul(x = input_51, y = var_337)[name = tensor<string, []>("op_338")];
-            tensor<fp32, [1, 1, 256]> input_53 = add(x = var_338, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 1, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 1, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 1, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 1, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_363 = const()[name = tensor<string, []>("op_363"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_364 = mul(x = input_53, y = var_363)[name = tensor<string, []>("op_364")];
+            tensor<fp32, [1, 1, 256]> input_55 = add(x = var_364, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 1, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -381,139 +387,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 1, 256]> var_352 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_354 = reshape(shape = var_353, x = var_352)[name = tensor<string, []>("op_354")];
+            tensor<fp32, [1, 1, 256]> var_378 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_379 = const()[name = tensor<string, []>("op_379"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_380 = reshape(shape = var_379, x = var_378)[name = tensor<string, []>("op_380")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_358 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_359 = const()[name = tensor<string, []>("op_359"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_360 = mul(x = var_358, y = var_359)[name = tensor<string, []>("op_360")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 1, 256]> var_384 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_386 = mul(x = var_384, y = var_385)[name = tensor<string, []>("op_386")];
+            tensor<int32, [4]> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_388 = reshape(shape = var_387, x = var_386)[name = tensor<string, []>("op_388")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_366 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_368 = reshape(shape = var_367, x = var_366)[name = tensor<string, []>("op_368")];
+            tensor<fp32, [1, 1, 256]> var_392 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 1, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [1]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [1]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_354)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 1, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_388)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 1, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_380)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 1, 1]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_378 = const()[name = tensor<string, []>("op_378"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_379 = reshape(shape = var_378, x = sqrt_s_t_3)[name = tensor<string, []>("op_379")];
-            tensor<fp32, [1, 1]> M_3 = real_div(x = encoder__causal_mask, y = var_379)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 1, 1]> var_381 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_381")];
+            tensor<int32, [2]> var_404 = const()[name = tensor<string, []>("op_404"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_405 = reshape(shape = var_404, x = sqrt_s_t_3)[name = tensor<string, []>("op_405")];
+            tensor<fp32, [1, 1]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_405)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 1, 1]> var_407 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_407")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_368)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_381, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_383_transpose_x_0 = const()[name = tensor<string, []>("op_383_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_383_transpose_y_0 = const()[name = tensor<string, []>("op_383_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_383 = matmul(transpose_x = var_383_transpose_x_0, transpose_y = var_383_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_383")];
-            tensor<fp32, [1]> var_384 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
-            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_383, y = var_386)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 1, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_394)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 1, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_407, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_409_transpose_x_0 = const()[name = tensor<string, []>("op_409_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_409_transpose_y_0 = const()[name = tensor<string, []>("op_409_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_409 = matmul(transpose_x = var_409_transpose_x_0, transpose_y = var_409_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_409")];
+            tensor<fp32, [1]> var_410 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_410")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
+            tensor<fp32, [1, 4, 1, 64]> cross_3 = mul(x = var_409, y = var_412)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 1, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_389 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_389")];
-            tensor<bool, []> var_391_transpose_x_1 = const()[name = tensor<string, []>("op_391_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_391_transpose_y_1 = const()[name = tensor<string, []>("op_391_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_1, transpose_y = var_391_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_389, y = var_391)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_393)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_395 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [1, 4, 64, 64]> var_396 = real_div(x = new_kv_unnorm_3, y = var_395)[name = tensor<string, []>("op_396")];
-            tensor<int32, [4]> var_397_perm_0 = const()[name = tensor<string, []>("op_397_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_415 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_415")];
+            tensor<bool, []> var_417_transpose_x_1 = const()[name = tensor<string, []>("op_417_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_417_transpose_y_1 = const()[name = tensor<string, []>("op_417_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_417 = matmul(transpose_x = var_417_transpose_x_1, transpose_y = var_417_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_417")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_415, y = var_417)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_419 = const()[name = tensor<string, []>("op_419"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_419)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_421 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_421")];
+            tensor<fp32, [1, 4, 64, 64]> var_422 = real_div(x = new_kv_unnorm_3, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423_perm_0 = const()[name = tensor<string, []>("op_423_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_397 = transpose(perm = var_397_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_397)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_401, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 1, 256]> var_403 = silu(x = input_57)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 1, 256]> input_59 = mul(x = var_403, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 1, 4, 64]> var_423 = transpose(perm = var_423_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 1, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_44, x = var_423)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_427 = const()[name = tensor<string, []>("op_427"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_11 = reshape(shape = var_427, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 1, 256]> var_429 = silu(x = input_59)[name = tensor<string, []>("op_429")];
+            tensor<fp32, [1, 1, 256]> input_61 = mul(x = var_429, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 1, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 1, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_5_begin_0 = const()[name = tensor<string, []>("window_5_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_5_end_0 = const()[name = tensor<string, []>("window_5_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_5_end_mask_0 = const()[name = tensor<string, []>("window_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_5_squeeze_mask_0 = const()[name = tensor<string, []>("window_5_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_5 = slice_by_index(begin = window_5_begin_0, end = window_5_end_0, end_mask = window_5_end_mask_0, squeeze_mask = window_5_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_414_begin_0 = const()[name = tensor<string, []>("op_414_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_414_end_0 = const()[name = tensor<string, []>("op_414_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_414_end_mask_0 = const()[name = tensor<string, []>("op_414_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_414 = slice_by_index(begin = var_414_begin_0, end = var_414_end_0, end_mask = var_414_end_mask_0, x = window_5)[name = tensor<string, []>("op_414")];
+            tensor<int32, [3]> var_440_begin_0 = const()[name = tensor<string, []>("op_440_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_440_end_0 = const()[name = tensor<string, []>("op_440_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_440_end_mask_0 = const()[name = tensor<string, []>("op_440_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_440 = slice_by_index(begin = var_440_begin_0, end = var_440_end_0, end_mask = var_440_end_mask_0, x = window_5)[name = tensor<string, []>("op_440")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_414, x_9))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = window_7)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_52, interleave = window_7_interleave_0, values = (var_440, x_9))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_63 = concat(axis = var_39, interleave = input_63_interleave_0, values = window_7)[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_439_split_sizes_0 = const()[name = tensor<string, []>("op_439_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_439_axis_0 = const()[name = tensor<string, []>("op_439_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_439_0, tensor<fp32, [1, 256, 16]> var_439_1 = split(axis = var_439_axis_0, split_sizes = var_439_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_439")];
-            tensor<fp32, [1, 256, 16]> var_441 = sigmoid(x = var_439_1)[name = tensor<string, []>("op_441")];
-            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_439_0, y = var_441)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [1, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [1, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_465_split_sizes_0 = const()[name = tensor<string, []>("op_465_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_465_axis_0 = const()[name = tensor<string, []>("op_465_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_465_0, tensor<fp32, [1, 256, 16]> var_465_1 = split(axis = var_465_axis_0, split_sizes = var_465_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 256, 16]> var_467 = sigmoid(x = var_465_1)[name = tensor<string, []>("op_467")];
+            tensor<fp32, [1, 256, 16]> inputs_15 = mul(x = var_465_0, y = var_467)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [1, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [1, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [1, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_472_begin_0 = const()[name = tensor<string, []>("op_472_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_472_end_0 = const()[name = tensor<string, []>("op_472_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_472_end_mask_0 = const()[name = tensor<string, []>("op_472_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [1, 1, 256]> var_472 = slice_by_index(begin = var_472_begin_0, end = var_472_end_0, end_mask = var_472_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_472")];
-            tensor<int32, [3]> var_474_perm_0 = const()[name = tensor<string, []>("op_474_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_474 = transpose(perm = var_474_perm_0, x = var_472)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 1, 256]> input_71 = add(x = x_9, y = var_474)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 1, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 1, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_497 = const()[name = tensor<string, []>("op_497"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_498 = mul(x = input_79, y = var_497)[name = tensor<string, []>("op_498")];
-            tensor<fp32, [1, 1, 256]> input_81 = add(x = var_498, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_498_begin_0 = const()[name = tensor<string, []>("op_498_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_498_end_0 = const()[name = tensor<string, []>("op_498_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_498_end_mask_0 = const()[name = tensor<string, []>("op_498_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [1, 1, 256]> var_498 = slice_by_index(begin = var_498_begin_0, end = var_498_end_0, end_mask = var_498_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_498")];
+            tensor<int32, [3]> var_500_perm_0 = const()[name = tensor<string, []>("op_500_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_500 = transpose(perm = var_500_perm_0, x = var_498)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 1, 256]> input_73 = add(x = x_9, y = var_500)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 1, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 1, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 1, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_523 = const()[name = tensor<string, []>("op_523"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_524 = mul(x = input_81, y = var_523)[name = tensor<string, []>("op_524")];
+            tensor<fp32, [1, 1, 256]> input_83 = add(x = var_524, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 1, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 1, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_527 = const()[name = tensor<string, []>("op_527"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_528 = mul(x = input_91, y = var_527)[name = tensor<string, []>("op_528")];
-            tensor<fp32, [1, 1, 256]> input_93 = add(x = var_528, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 1, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 1, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 1, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 1, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_553 = const()[name = tensor<string, []>("op_553"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_554 = mul(x = input_93, y = var_553)[name = tensor<string, []>("op_554")];
+            tensor<fp32, [1, 1, 256]> input_95 = add(x = var_554, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 1, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -524,139 +530,139 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 1, 256]> var_542 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_544 = reshape(shape = var_543, x = var_542)[name = tensor<string, []>("op_544")];
+            tensor<fp32, [1, 1, 256]> var_568 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_569 = const()[name = tensor<string, []>("op_569"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_570 = reshape(shape = var_569, x = var_568)[name = tensor<string, []>("op_570")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_548 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_550 = mul(x = var_548, y = var_549)[name = tensor<string, []>("op_550")];
-            tensor<int32, [4]> var_551 = const()[name = tensor<string, []>("op_551"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_552 = reshape(shape = var_551, x = var_550)[name = tensor<string, []>("op_552")];
+            tensor<fp32, [1, 1, 256]> var_574 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_576 = mul(x = var_574, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<int32, [4]> var_577 = const()[name = tensor<string, []>("op_577"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_578 = reshape(shape = var_577, x = var_576)[name = tensor<string, []>("op_578")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_556 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_557 = const()[name = tensor<string, []>("op_557"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_558 = reshape(shape = var_557, x = var_556)[name = tensor<string, []>("op_558")];
+            tensor<fp32, [1, 1, 256]> var_582 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 1, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [1]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [1]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_552)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_544)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 1, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_578)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 1, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_570)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 1, 1]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_568 = const()[name = tensor<string, []>("op_568"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_569 = reshape(shape = var_568, x = sqrt_s_t_5)[name = tensor<string, []>("op_569")];
-            tensor<fp32, [1, 1]> M_5 = real_div(x = encoder__causal_mask, y = var_569)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 1, 1]> var_571 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_571")];
+            tensor<int32, [2]> var_594 = const()[name = tensor<string, []>("op_594"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_595 = reshape(shape = var_594, x = sqrt_s_t_5)[name = tensor<string, []>("op_595")];
+            tensor<fp32, [1, 1]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_595)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 1, 1]> var_597 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_597")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_558)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_571, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_573_transpose_x_0 = const()[name = tensor<string, []>("op_573_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_573_transpose_y_0 = const()[name = tensor<string, []>("op_573_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_573 = matmul(transpose_x = var_573_transpose_x_0, transpose_y = var_573_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_573")];
-            tensor<fp32, [1]> var_574 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_574")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_573, y = var_576)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 1, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 1, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_597, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_599_transpose_x_0 = const()[name = tensor<string, []>("op_599_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_599_transpose_y_0 = const()[name = tensor<string, []>("op_599_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_599 = matmul(transpose_x = var_599_transpose_x_0, transpose_y = var_599_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_599")];
+            tensor<fp32, [1]> var_600 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_600")];
+            tensor<int32, [4]> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_602 = reshape(shape = var_601, x = var_600)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 1, 64]> cross_5 = mul(x = var_599, y = var_602)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 1, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_579 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_579")];
-            tensor<bool, []> var_581_transpose_x_1 = const()[name = tensor<string, []>("op_581_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_581_transpose_y_1 = const()[name = tensor<string, []>("op_581_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_581 = matmul(transpose_x = var_581_transpose_x_1, transpose_y = var_581_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_581")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_579, y = var_581)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_583)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_585 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [1, 4, 64, 64]> var_586 = real_div(x = new_kv_unnorm_5, y = var_585)[name = tensor<string, []>("op_586")];
-            tensor<int32, [4]> var_587_perm_0 = const()[name = tensor<string, []>("op_587_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_605 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_605")];
+            tensor<bool, []> var_607_transpose_x_1 = const()[name = tensor<string, []>("op_607_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_607_transpose_y_1 = const()[name = tensor<string, []>("op_607_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_607 = matmul(transpose_x = var_607_transpose_x_1, transpose_y = var_607_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_607")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_605, y = var_607)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_609)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_611 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_611")];
+            tensor<fp32, [1, 4, 64, 64]> var_612 = real_div(x = new_kv_unnorm_5, y = var_611)[name = tensor<string, []>("op_612")];
+            tensor<int32, [4]> var_613_perm_0 = const()[name = tensor<string, []>("op_613_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_587 = transpose(perm = var_587_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_587)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_591, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 1, 256]> var_593 = silu(x = input_97)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 1, 256]> input_99 = mul(x = var_593, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 1, 4, 64]> var_613 = transpose(perm = var_613_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 1, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_44, x = var_613)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_617 = const()[name = tensor<string, []>("op_617"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_17 = reshape(shape = var_617, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 1, 256]> var_619 = silu(x = input_99)[name = tensor<string, []>("op_619")];
+            tensor<fp32, [1, 1, 256]> input_101 = mul(x = var_619, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 1, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 1, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_604_begin_0 = const()[name = tensor<string, []>("op_604_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_604_end_0 = const()[name = tensor<string, []>("op_604_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_604_end_mask_0 = const()[name = tensor<string, []>("op_604_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_604 = slice_by_index(begin = var_604_begin_0, end = var_604_end_0, end_mask = var_604_end_mask_0, x = window_9)[name = tensor<string, []>("op_604")];
+            tensor<int32, [3]> var_630_begin_0 = const()[name = tensor<string, []>("op_630_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_630_end_0 = const()[name = tensor<string, []>("op_630_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_630_end_mask_0 = const()[name = tensor<string, []>("op_630_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_630 = slice_by_index(begin = var_630_begin_0, end = var_630_end_0, end_mask = var_630_end_mask_0, x = window_9)[name = tensor<string, []>("op_630")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_604, x_15))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = window_11)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_52, interleave = window_11_interleave_0, values = (var_630, x_15))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_103 = concat(axis = var_39, interleave = input_103_interleave_0, values = window_11)[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_629_split_sizes_0 = const()[name = tensor<string, []>("op_629_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_629_axis_0 = const()[name = tensor<string, []>("op_629_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_629_0, tensor<fp32, [1, 256, 16]> var_629_1 = split(axis = var_629_axis_0, split_sizes = var_629_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 256, 16]> var_631 = sigmoid(x = var_629_1)[name = tensor<string, []>("op_631")];
-            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_629_0, y = var_631)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [1, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [1, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_655_split_sizes_0 = const()[name = tensor<string, []>("op_655_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_655_axis_0 = const()[name = tensor<string, []>("op_655_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_655_0, tensor<fp32, [1, 256, 16]> var_655_1 = split(axis = var_655_axis_0, split_sizes = var_655_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_655")];
+            tensor<fp32, [1, 256, 16]> var_657 = sigmoid(x = var_655_1)[name = tensor<string, []>("op_657")];
+            tensor<fp32, [1, 256, 16]> inputs_25 = mul(x = var_655_0, y = var_657)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [1, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [1, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [1, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [1, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_662_begin_0 = const()[name = tensor<string, []>("op_662_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_662_end_0 = const()[name = tensor<string, []>("op_662_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_662_end_mask_0 = const()[name = tensor<string, []>("op_662_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [1, 1, 256]> var_662 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_662")];
-            tensor<int32, [3]> var_664_perm_0 = const()[name = tensor<string, []>("op_664_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_664 = transpose(perm = var_664_perm_0, x = var_662)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 1, 256]> input_111 = add(x = x_15, y = var_664)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 1, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 1, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_688 = mul(x = input_119, y = var_687)[name = tensor<string, []>("op_688")];
-            tensor<fp32, [1, 1, 256]> input_121 = add(x = var_688, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_688_begin_0 = const()[name = tensor<string, []>("op_688_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_688_end_0 = const()[name = tensor<string, []>("op_688_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_688_end_mask_0 = const()[name = tensor<string, []>("op_688_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [1, 1, 256]> var_688 = slice_by_index(begin = var_688_begin_0, end = var_688_end_0, end_mask = var_688_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_688")];
+            tensor<int32, [3]> var_690_perm_0 = const()[name = tensor<string, []>("op_690_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_690 = transpose(perm = var_690_perm_0, x = var_688)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 1, 256]> input_113 = add(x = x_15, y = var_690)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 1, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 1, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 1, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_713 = const()[name = tensor<string, []>("op_713"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_714 = mul(x = input_121, y = var_713)[name = tensor<string, []>("op_714")];
+            tensor<fp32, [1, 1, 256]> input_123 = add(x = var_714, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 1, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 1, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_717 = const()[name = tensor<string, []>("op_717"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_718 = mul(x = input_131, y = var_717)[name = tensor<string, []>("op_718")];
-            tensor<fp32, [1, 1, 256]> input_133 = add(x = var_718, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 1, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 1, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 1, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 1, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_743 = const()[name = tensor<string, []>("op_743"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_744 = mul(x = input_133, y = var_743)[name = tensor<string, []>("op_744")];
+            tensor<fp32, [1, 1, 256]> input_135 = add(x = var_744, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 1, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_36, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -667,175 +673,168 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 1, 256]> var_732 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_733 = const()[name = tensor<string, []>("op_733"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_734 = reshape(shape = var_733, x = var_732)[name = tensor<string, []>("op_734")];
+            tensor<fp32, [1, 1, 256]> var_758 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_760 = reshape(shape = var_759, x = var_758)[name = tensor<string, []>("op_760")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_738 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_739 = const()[name = tensor<string, []>("op_739"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 1, 256]> var_740 = mul(x = var_738, y = var_739)[name = tensor<string, []>("op_740")];
-            tensor<int32, [4]> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_742 = reshape(shape = var_741, x = var_740)[name = tensor<string, []>("op_742")];
+            tensor<fp32, [1, 1, 256]> var_764 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 1, 256]> var_766 = mul(x = var_764, y = var_765)[name = tensor<string, []>("op_766")];
+            tensor<int32, [4]> var_767 = const()[name = tensor<string, []>("op_767"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_768 = reshape(shape = var_767, x = var_766)[name = tensor<string, []>("op_768")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> var_746 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<int32, [4]>([1, 1, 4, 64])];
-            tensor<fp32, [1, 1, 4, 64]> var_748 = reshape(shape = var_747, x = var_746)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 1, 256]> var_772 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<int32, [4]>([1, 1, 4, 64])];
+            tensor<fp32, [1, 1, 4, 64]> var_774 = reshape(shape = var_773, x = var_772)[name = tensor<string, []>("op_774")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 1, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 1, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [1]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [1]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_742)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_734)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 1, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_768)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 1, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_760)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 1, 1]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_758 = const()[name = tensor<string, []>("op_758"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_759 = reshape(shape = var_758, x = sqrt_s_t_7)[name = tensor<string, []>("op_759")];
-            tensor<fp32, [1, 1]> M_7 = real_div(x = encoder__causal_mask, y = var_759)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 1, 1]> var_761 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_761")];
+            tensor<int32, [2]> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_785 = reshape(shape = var_784, x = sqrt_s_t_7)[name = tensor<string, []>("op_785")];
+            tensor<fp32, [1, 1]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_785)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 1, 1]> var_787 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_787")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_748)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_761, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_763_transpose_x_0 = const()[name = tensor<string, []>("op_763_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_763_transpose_y_0 = const()[name = tensor<string, []>("op_763_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 1, 64]> var_763 = matmul(transpose_x = var_763_transpose_x_0, transpose_y = var_763_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_763")];
-            tensor<fp32, [1]> var_764 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_763, y = var_766)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 1, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_774)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 1, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_787, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_789_transpose_x_0 = const()[name = tensor<string, []>("op_789_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_789_transpose_y_0 = const()[name = tensor<string, []>("op_789_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 1, 64]> var_789 = matmul(transpose_x = var_789_transpose_x_0, transpose_y = var_789_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_789")];
+            tensor<fp32, [1]> var_790 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_790")];
+            tensor<int32, [4]> var_791 = const()[name = tensor<string, []>("op_791"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_792 = reshape(shape = var_791, x = var_790)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [1, 4, 1, 64]> cross_7 = mul(x = var_789, y = var_792)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 1, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_769 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_769")];
-            tensor<bool, []> var_771_transpose_x_1 = const()[name = tensor<string, []>("op_771_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_771_transpose_y_1 = const()[name = tensor<string, []>("op_771_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_771 = matmul(transpose_x = var_771_transpose_x_1, transpose_y = var_771_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_771")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_769, y = var_771)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_773 = const()[name = tensor<string, []>("op_773"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_773)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_775 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_775")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_775)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_777_perm_0 = const()[name = tensor<string, []>("op_777_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_795 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_795")];
+            tensor<bool, []> var_797_transpose_x_1 = const()[name = tensor<string, []>("op_797_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_797_transpose_y_1 = const()[name = tensor<string, []>("op_797_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_797 = matmul(transpose_x = var_797_transpose_x_1, transpose_y = var_797_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_797")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_795, y = var_797)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_799)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_801 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_801")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_801)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_803_perm_0 = const()[name = tensor<string, []>("op_803_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 4, 64]> var_777 = transpose(perm = var_777_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_777)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_781, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 1, 256]> var_783 = silu(x = input_137)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [1, 1, 256]> input_139 = mul(x = var_783, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 1, 4, 64]> var_803 = transpose(perm = var_803_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 1, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_44, x = var_803)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<fp32, [1, 1, 256]> out_23 = reshape(shape = var_807, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 1, 256]> var_809 = silu(x = input_139)[name = tensor<string, []>("op_809")];
+            tensor<fp32, [1, 1, 256]> input_141 = mul(x = var_809, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 1, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 1, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_794_begin_0 = const()[name = tensor<string, []>("op_794_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_794_end_0 = const()[name = tensor<string, []>("op_794_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_794_end_mask_0 = const()[name = tensor<string, []>("op_794_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_794 = slice_by_index(begin = var_794_begin_0, end = var_794_end_0, end_mask = var_794_end_mask_0, x = window_13)[name = tensor<string, []>("op_794")];
+            tensor<int32, [3]> var_820_begin_0 = const()[name = tensor<string, []>("op_820_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_820_end_0 = const()[name = tensor<string, []>("op_820_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_820_end_mask_0 = const()[name = tensor<string, []>("op_820_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_820 = slice_by_index(begin = var_820_begin_0, end = var_820_end_0, end_mask = var_820_end_mask_0, x = window_13)[name = tensor<string, []>("op_820")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_794, x_21))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = window)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_52, interleave = window_interleave_0, values = (var_820, x_21))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 16, 256]> input_143 = concat(axis = var_39, interleave = input_143_interleave_0, values = window)[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [1, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_819_split_sizes_0 = const()[name = tensor<string, []>("op_819_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_819_axis_0 = const()[name = tensor<string, []>("op_819_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> var_819_0, tensor<fp32, [1, 256, 16]> var_819_1 = split(axis = var_819_axis_0, split_sizes = var_819_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 256, 16]> var_821 = sigmoid(x = var_819_1)[name = tensor<string, []>("op_821")];
-            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_819_0, y = var_821)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [1, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [1, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_845_split_sizes_0 = const()[name = tensor<string, []>("op_845_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_845_axis_0 = const()[name = tensor<string, []>("op_845_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> var_845_0, tensor<fp32, [1, 256, 16]> var_845_1 = split(axis = var_845_axis_0, split_sizes = var_845_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 256, 16]> var_847 = sigmoid(x = var_845_1)[name = tensor<string, []>("op_847")];
+            tensor<fp32, [1, 256, 16]> inputs_35 = mul(x = var_845_0, y = var_847)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [1, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [1, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [1, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([1, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [1, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [1, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_36, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [1, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_852_begin_0 = const()[name = tensor<string, []>("op_852_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_852_end_0 = const()[name = tensor<string, []>("op_852_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_852_end_mask_0 = const()[name = tensor<string, []>("op_852_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [1, 1, 256]> var_852 = slice_by_index(begin = var_852_begin_0, end = var_852_end_0, end_mask = var_852_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_852")];
-            tensor<int32, [3]> var_854_perm_0 = const()[name = tensor<string, []>("op_854_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 1, 256]> var_854 = transpose(perm = var_854_perm_0, x = var_852)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 1, 256]> input_151 = add(x = x_21, y = var_854)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 1, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 1, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_877 = const()[name = tensor<string, []>("op_877"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 1, 256]> var_878 = mul(x = input_159, y = var_877)[name = tensor<string, []>("op_878")];
-            tensor<fp32, [1, 1, 256]> input_161 = add(x = var_878, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_878_begin_0 = const()[name = tensor<string, []>("op_878_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_878_end_0 = const()[name = tensor<string, []>("op_878_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_878_end_mask_0 = const()[name = tensor<string, []>("op_878_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [1, 1, 256]> var_878 = slice_by_index(begin = var_878_begin_0, end = var_878_end_0, end_mask = var_878_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_878")];
+            tensor<int32, [3]> var_880_perm_0 = const()[name = tensor<string, []>("op_880_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 1, 256]> var_880 = transpose(perm = var_880_perm_0, x = var_878)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 1, 256]> input_153 = add(x = x_21, y = var_880)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_36, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 1, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 1, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 1, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 1, 256]> var_904 = mul(x = input_161, y = var_903)[name = tensor<string, []>("op_904")];
+            tensor<fp32, [1, 1, 256]> input_163 = add(x = var_904, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 1, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_36, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 1]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 19]> cat = concat(axis = var_41, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = cat)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_898 = const()[name = tensor<string, []>("op_898"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 1, 1]> var_899 = reduce_l2_norm(axes = var_898, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 256, 1]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 256, 19])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = cat)[name = tensor<string, []>("op_922")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 1, 1]> var_925 = reduce_l2_norm(axes = var_924, keep_dims = var_35, x = input_165)[name = tensor<string, []>("op_925")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_899)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_903_axis_0 = const()[name = tensor<string, []>("op_903_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_903_axis_0, values = (var_206, var_396, var_586, nkv_1))[name = tensor<string, []>("op_903")];
-            tensor<int32, []> var_905_axis_0 = const()[name = tensor<string, []>("op_905_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_905_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_905")];
-            tensor<int32, []> var_907_axis_0 = const()[name = tensor<string, []>("op_907_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_907_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_907")];
-            tensor<fp32, []> var_916 = const()[name = tensor<string, []>("op_916"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_923 = const()[name = tensor<string, []>("op_923"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_924 = const()[name = tensor<string, []>("op_924"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_926 = const()[name = tensor<string, []>("op_926"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<int32, []>(0)];
-            tensor<fp32, [1, 1, 12, 256]> var_993 = const()[name = tensor<string, []>("op_993"), val = tensor<fp32, [1, 1, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_998_axes_0 = const()[name = tensor<string, []>("op_998_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 1, 1, 256]> var_998 = expand_dims(axes = var_998_axes_0, x = emb)[name = tensor<string, []>("op_998")];
+            tensor<fp32, [1, 1, 1]> clip_0 = clip(alpha = var_49, beta = const_12, x = var_925)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 1, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_929_axis_0, values = (var_232, var_422, var_612, nkv_1))[name = tensor<string, []>("op_929")];
+            tensor<int32, []> var_931_axis_0 = const()[name = tensor<string, []>("op_931_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_931_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_931")];
+            tensor<int32, []> var_933_axis_0 = const()[name = tensor<string, []>("op_933_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_933_axis_0, values = (window_3, window_7, window_11, window))[name = tensor<string, []>("op_933")];
+            tensor<fp32, [1, 1, 12, 256]> var_996 = const()[name = tensor<string, []>("op_996"), val = tensor<fp32, [1, 1, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
+            tensor<int32, [1]> var_1001_axes_0 = const()[name = tensor<string, []>("op_1001_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 1, 1, 256]> var_1001 = expand_dims(axes = var_1001_axes_0, x = emb)[name = tensor<string, []>("op_1001")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 1, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_998)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 1, 12, 512]> input_165 = concat(axis = var_930, interleave = input_165_interleave_0, values = (emb_exp, var_993))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 1, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1010 = const()[name = tensor<string, []>("op_1010"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 1, 256]> x_29 = reshape(shape = var_1010, x = var_1006)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 1, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1001)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 1, 12, 512]> input_167 = concat(axis = var_42, interleave = input_167_interleave_0, values = (emb_exp, var_996))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 1, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1009_perm_0 = const()[name = tensor<string, []>("op_1009_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1013 = const()[name = tensor<string, []>("op_1013"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1009 = transpose(perm = var_1009_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 1, 256]> x_29 = reshape(shape = var_1013, x = var_1009)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -846,131 +845,131 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 1, 256]> var_1018 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1020 = reshape(shape = var_1019, x = var_1018)[name = tensor<string, []>("op_1020")];
+            tensor<fp32, [12, 1, 256]> var_1021 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1023 = reshape(shape = var_1022, x = var_1021)[name = tensor<string, []>("op_1023")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1024 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1025 = const()[name = tensor<string, []>("op_1025"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 1, 256]> var_1026 = mul(x = var_1024, y = var_1025)[name = tensor<string, []>("op_1026")];
-            tensor<int32, [4]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1028 = reshape(shape = var_1027, x = var_1026)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [12, 1, 256]> var_1027 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1028 = const()[name = tensor<string, []>("op_1028"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 1, 256]> var_1029 = mul(x = var_1027, y = var_1028)[name = tensor<string, []>("op_1029")];
+            tensor<int32, [4]> var_1030 = const()[name = tensor<string, []>("op_1030"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1031 = reshape(shape = var_1030, x = var_1029)[name = tensor<string, []>("op_1031")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1032 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1033 = const()[name = tensor<string, []>("op_1033"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1034 = reshape(shape = var_1033, x = var_1032)[name = tensor<string, []>("op_1034")];
+            tensor<fp32, [12, 1, 256]> var_1035 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1036 = const()[name = tensor<string, []>("op_1036"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1037 = reshape(shape = var_1036, x = var_1035)[name = tensor<string, []>("op_1037")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 1, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_936, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [1]> cumsum_mask_1 = cumsum(axis = var_39, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [1]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_1 = clip(alpha = var_926, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [1]> clip_1 = clip(alpha = var_29, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [1]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1028)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1020)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 1, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1031)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 1, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1023)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 1, 1]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1046 = const()[name = tensor<string, []>("op_1046"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1047 = reshape(shape = var_1046, x = valid_mask)[name = tensor<string, []>("op_1047")];
             tensor<int32, [2]> var_1049 = const()[name = tensor<string, []>("op_1049"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = sqrt_s_t_9)[name = tensor<string, []>("op_1050")];
-            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1047, y = var_1050)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 1, 1]> var_1052 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [1, 1]> var_1050 = reshape(shape = var_1049, x = valid_mask)[name = tensor<string, []>("op_1050")];
+            tensor<int32, [2]> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1053 = reshape(shape = var_1052, x = sqrt_s_t_9)[name = tensor<string, []>("op_1053")];
+            tensor<fp32, [1, 1]> M_9 = real_div(x = var_1050, y = var_1053)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 1, 1]> var_1055 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1055")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1034)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1052, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1054_transpose_x_0 = const()[name = tensor<string, []>("op_1054_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1054_transpose_y_0 = const()[name = tensor<string, []>("op_1054_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> var_1054 = matmul(transpose_x = var_1054_transpose_x_0, transpose_y = var_1054_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1054")];
-            tensor<fp32, [1]> var_1055 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1055")];
-            tensor<int32, [4]> var_1056 = const()[name = tensor<string, []>("op_1056"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1057 = reshape(shape = var_1056, x = var_1055)[name = tensor<string, []>("op_1057")];
-            tensor<fp32, [12, 4, 1, 64]> cross_9 = mul(x = var_1054, y = var_1057)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 1, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1037)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 1, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1055, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1057_transpose_x_0 = const()[name = tensor<string, []>("op_1057_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1057_transpose_y_0 = const()[name = tensor<string, []>("op_1057_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> var_1057 = matmul(transpose_x = var_1057_transpose_x_0, transpose_y = var_1057_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1057")];
+            tensor<fp32, [1]> var_1058 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1058")];
+            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [12, 4, 1, 64]> cross_9 = mul(x = var_1057, y = var_1060)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 1, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1060 = const()[name = tensor<string, []>("op_1060"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1061 = reshape(shape = var_1060, x = valid_mask)[name = tensor<string, []>("op_1061")];
-            tensor<fp32, [12, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1061)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1063 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1063")];
-            tensor<bool, []> var_1065_transpose_x_1 = const()[name = tensor<string, []>("op_1065_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1065_transpose_y_1 = const()[name = tensor<string, []>("op_1065_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1065 = matmul(transpose_x = var_1065_transpose_x_1, transpose_y = var_1065_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1065")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1063, y = var_1065)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1067_keep_dims_0 = const()[name = tensor<string, []>("op_1067_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1067 = reduce_sum(keep_dims = var_1067_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1067")];
-            tensor<int32, [1]> var_1068 = const()[name = tensor<string, []>("op_1068"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1069 = reshape(shape = var_1068, x = var_1067)[name = tensor<string, []>("op_1069")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1069)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1063 = const()[name = tensor<string, []>("op_1063"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1064 = reshape(shape = var_1063, x = valid_mask)[name = tensor<string, []>("op_1064")];
+            tensor<fp32, [12, 4, 1, 64]> v_masked_1 = mul(x = v_9, y = var_1064)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1066 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1066")];
+            tensor<bool, []> var_1068_transpose_x_1 = const()[name = tensor<string, []>("op_1068_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1068_transpose_y_1 = const()[name = tensor<string, []>("op_1068_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1068 = matmul(transpose_x = var_1068_transpose_x_1, transpose_y = var_1068_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1068")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1066, y = var_1068)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1070_keep_dims_0 = const()[name = tensor<string, []>("op_1070_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1070 = reduce_sum(keep_dims = var_1070_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1070")];
+            tensor<int32, [1]> var_1071 = const()[name = tensor<string, []>("op_1071"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1072 = reshape(shape = var_1071, x = var_1070)[name = tensor<string, []>("op_1072")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1072)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_926, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_29, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1073 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1073")];
-            tensor<int32, [4]> var_1074_perm_0 = const()[name = tensor<string, []>("op_1074_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1076 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1076")];
+            tensor<int32, [4]> var_1077_perm_0 = const()[name = tensor<string, []>("op_1077_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 4, 64]> var_1074 = transpose(perm = var_1074_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_923, x = var_1074)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> out_29 = reshape(shape = var_1078, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 1, 256]> var_1080 = silu(x = input_169)[name = tensor<string, []>("op_1080")];
-            tensor<fp32, [12, 1, 256]> input_171 = mul(x = var_1080, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 1, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 1, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 1, 4, 64]> var_1077 = transpose(perm = var_1077_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 1, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_44, x = var_1077)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> out_29 = reshape(shape = var_1081, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 1, 256]> var_1083 = silu(x = input_171)[name = tensor<string, []>("op_1083")];
+            tensor<fp32, [12, 1, 256]> input_173 = mul(x = var_1083, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 1, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 1, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_921, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1090 = const()[name = tensor<string, []>("op_1090"), val = tensor<int32, [4]>([1, 12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1091 = reshape(shape = var_1090, x = xt_1)[name = tensor<string, []>("op_1091")];
-            tensor<int32, [4]> var_1092_perm_0 = const()[name = tensor<string, []>("op_1092_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1095 = const()[name = tensor<string, []>("op_1095"), val = tensor<int32, [3]>([1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> var_1092 = transpose(perm = var_1092_perm_0, x = var_1091)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [1, 12, 256]> query_1 = reshape(shape = var_1095, x = var_1092)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 1, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_36, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [4]>([1, 12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1094 = reshape(shape = var_1093, x = xt_1)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [4]> var_1095_perm_0 = const()[name = tensor<string, []>("op_1095_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1098 = const()[name = tensor<string, []>("op_1098"), val = tensor<int32, [3]>([1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> var_1095 = transpose(perm = var_1095_perm_0, x = var_1094)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [1, 12, 256]> query_1 = reshape(shape = var_1098, x = var_1095)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 1, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 1, 768]> var_1118 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 1, 768]> var_1121 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 1, 3, 256])];
-            tensor<fp32, [12, 1, 3, 256]> var_1120 = reshape(shape = concat_1, x = var_1118)[name = tensor<string, []>("op_1120")];
-            tensor<int32, [1]> var_1121_axes_0 = const()[name = tensor<string, []>("op_1121_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 1, 3, 256]> var_1121 = expand_dims(axes = var_1121_axes_0, x = var_1120)[name = tensor<string, []>("op_1121")];
-            tensor<int32, [5]> var_1122_perm_0 = const()[name = tensor<string, []>("op_1122_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1123_axes_0 = const()[name = tensor<string, []>("op_1123_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 1, 1, 256]> var_1122 = transpose(perm = var_1122_perm_0, x = var_1121)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 1, 256]> var_1123 = squeeze(axes = var_1123_axes_0, x = var_1122)[name = tensor<string, []>("op_1123")];
+            tensor<fp32, [12, 1, 3, 256]> var_1123 = reshape(shape = concat_1, x = var_1121)[name = tensor<string, []>("op_1123")];
+            tensor<int32, [1]> var_1124_axes_0 = const()[name = tensor<string, []>("op_1124_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 1, 3, 256]> var_1124 = expand_dims(axes = var_1124_axes_0, x = var_1123)[name = tensor<string, []>("op_1124")];
+            tensor<int32, [5]> var_1125_perm_0 = const()[name = tensor<string, []>("op_1125_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1126_axes_0 = const()[name = tensor<string, []>("op_1126_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 1, 1, 256]> var_1125 = transpose(perm = var_1125_perm_0, x = var_1124)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 1, 256]> var_1126 = squeeze(axes = var_1126_axes_0, x = var_1125)[name = tensor<string, []>("op_1126")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 1, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 1, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 1, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 1, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 1, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1123)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1131 = const()[name = tensor<string, []>("op_1131"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1132 = reshape(shape = var_1131, x = q_11)[name = tensor<string, []>("op_1132")];
+            tensor<fp32, [12, 1, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1126)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1134 = const()[name = tensor<string, []>("op_1134"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1135 = reshape(shape = var_1134, x = q_11)[name = tensor<string, []>("op_1135")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1138 = const()[name = tensor<string, []>("op_1138"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1139 = reshape(shape = var_1138, x = k_11)[name = tensor<string, []>("op_1139")];
+            tensor<int32, [3]> var_1141 = const()[name = tensor<string, []>("op_1141"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1142 = reshape(shape = var_1141, x = k_11)[name = tensor<string, []>("op_1142")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1146 = reshape(shape = var_1145, x = v_11)[name = tensor<string, []>("op_1146")];
+            tensor<int32, [3]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1149 = reshape(shape = var_1148, x = v_11)[name = tensor<string, []>("op_1149")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1149 = const()[name = tensor<string, []>("op_1149"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1132)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [1, 4, 12, 64]> q_15 = reshape(shape = var_1149, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1151 = const()[name = tensor<string, []>("op_1151"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1139)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [1, 4, 12, 64]> k_15 = reshape(shape = var_1151, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1153 = const()[name = tensor<string, []>("op_1153"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1146)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [1, 4, 12, 64]> v_15 = reshape(shape = var_1153, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1135)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [1, 4, 12, 64]> q_15 = reshape(shape = var_1152, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1142)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [1, 4, 12, 64]> k_15 = reshape(shape = var_1154, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1149)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [1, 4, 12, 64]> v_15 = reshape(shape = var_1156, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -981,30 +980,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1161 = const()[name = tensor<string, []>("op_1161"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [12, 1, 4, 64]> var_1157 = transpose(perm = var_1156, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1161, x = var_1157)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> attn_output_7 = reshape(shape = var_1165, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [12, 1, 4, 64]> var_1160 = transpose(perm = var_1159, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [12, 256]> attn_output_3 = reshape(shape = var_1164, x = var_1160)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [12, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1168 = const()[name = tensor<string, []>("op_1168"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> attn_output_7 = reshape(shape = var_1168, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [1, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_921, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [1, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [1, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [1, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [1, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_36, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [1, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [1, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [1, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [1, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_921, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> x_31 = reshape(shape = var_1185, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1187_perm_0 = const()[name = tensor<string, []>("op_1187_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1187 = transpose(perm = var_1187_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 1, 256]> x = reshape(shape = var_1191, x = var_1187)[name = tensor<string, []>("x")];
+            tensor<fp32, [1, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_36, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([1, 1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> x_31 = reshape(shape = var_1188, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1190_perm_0 = const()[name = tensor<string, []>("op_1190_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1190 = transpose(perm = var_1190_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 1, 256]> x = reshape(shape = var_1194, x = var_1190)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1015,120 +1014,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 1, 256]> var_1199 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1200 = const()[name = tensor<string, []>("op_1200"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1201 = reshape(shape = var_1200, x = var_1199)[name = tensor<string, []>("op_1201")];
+            tensor<fp32, [12, 1, 256]> var_1202 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1203 = const()[name = tensor<string, []>("op_1203"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1204 = reshape(shape = var_1203, x = var_1202)[name = tensor<string, []>("op_1204")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1205 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 1, 256]> var_1207 = mul(x = var_1205, y = var_1206)[name = tensor<string, []>("op_1207")];
-            tensor<int32, [4]> var_1208 = const()[name = tensor<string, []>("op_1208"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1209 = reshape(shape = var_1208, x = var_1207)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [12, 1, 256]> var_1208 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 1, 256]> var_1210 = mul(x = var_1208, y = var_1209)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [4]> var_1211 = const()[name = tensor<string, []>("op_1211"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1212 = reshape(shape = var_1211, x = var_1210)[name = tensor<string, []>("op_1212")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> var_1213 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1214 = const()[name = tensor<string, []>("op_1214"), val = tensor<int32, [4]>([12, 1, 4, 64])];
-            tensor<fp32, [12, 1, 4, 64]> var_1215 = reshape(shape = var_1214, x = var_1213)[name = tensor<string, []>("op_1215")];
+            tensor<fp32, [12, 1, 256]> var_1216 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([12, 1, 4, 64])];
+            tensor<fp32, [12, 1, 4, 64]> var_1218 = reshape(shape = var_1217, x = var_1216)[name = tensor<string, []>("op_1218")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 1, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 1, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [1]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_3 = clip(alpha = var_926, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [1]> clip_3 = clip(alpha = var_29, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [1]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1209)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1201)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 1, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1212)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 1, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1204)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 1, 1]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp32, [1, 1]> var_1231 = reshape(shape = var_1230, x = sqrt_s_t)[name = tensor<string, []>("op_1231")];
-            tensor<fp32, [1, 1]> M = real_div(x = var_1047, y = var_1231)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 1, 1]> var_1233 = mul(x = qk, y = M)[name = tensor<string, []>("op_1233")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1215)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 1, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1233, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1235_transpose_x_0 = const()[name = tensor<string, []>("op_1235_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1235_transpose_y_0 = const()[name = tensor<string, []>("op_1235_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 1, 64]> var_1235 = matmul(transpose_x = var_1235_transpose_x_0, transpose_y = var_1235_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1235")];
-            tensor<fp32, [1]> var_1236 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1236")];
-            tensor<int32, [4]> var_1237 = const()[name = tensor<string, []>("op_1237"), val = tensor<int32, [4]>([1, 1, 1, 1])];
-            tensor<fp32, [1, 1, 1, 1]> var_1238 = reshape(shape = var_1237, x = var_1236)[name = tensor<string, []>("op_1238")];
-            tensor<fp32, [12, 4, 1, 64]> cross = mul(x = var_1235, y = var_1238)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 1, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1061)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1244 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1244")];
-            tensor<bool, []> var_1246_transpose_x_1 = const()[name = tensor<string, []>("op_1246_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1246_transpose_y_1 = const()[name = tensor<string, []>("op_1246_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1246 = matmul(transpose_x = var_1246_transpose_x_1, transpose_y = var_1246_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1246")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1244, y = var_1246)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1069)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1233 = const()[name = tensor<string, []>("op_1233"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp32, [1, 1]> var_1234 = reshape(shape = var_1233, x = sqrt_s_t)[name = tensor<string, []>("op_1234")];
+            tensor<fp32, [1, 1]> M = real_div(x = var_1050, y = var_1234)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 1, 1]> var_1236 = mul(x = qk, y = M)[name = tensor<string, []>("op_1236")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1218)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 1, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1236, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1238_transpose_x_0 = const()[name = tensor<string, []>("op_1238_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1238_transpose_y_0 = const()[name = tensor<string, []>("op_1238_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 1, 64]> var_1238 = matmul(transpose_x = var_1238_transpose_x_0, transpose_y = var_1238_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1238")];
+            tensor<fp32, [1]> var_1239 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<fp32, [1, 1, 1, 1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [12, 4, 1, 64]> cross = mul(x = var_1238, y = var_1241)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 1, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 1, 64]> v_masked = mul(x = v_17, y = var_1064)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1247 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1247")];
+            tensor<bool, []> var_1249_transpose_x_1 = const()[name = tensor<string, []>("op_1249_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1249_transpose_y_1 = const()[name = tensor<string, []>("op_1249_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1249 = matmul(transpose_x = var_1249_transpose_x_1, transpose_y = var_1249_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1249")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1247, y = var_1249)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1072)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_926, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_29, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1255_perm_0 = const()[name = tensor<string, []>("op_1255_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1258_perm_0 = const()[name = tensor<string, []>("op_1258_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 4, 64]> var_1255 = transpose(perm = var_1255_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_923, x = var_1255)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> out = reshape(shape = var_1259, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 1, 256]> var_1261 = silu(x = input_187)[name = tensor<string, []>("op_1261")];
-            tensor<fp32, [12, 1, 256]> input_189 = mul(x = var_1261, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 1, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 1, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 1, 4, 64]> var_1258 = transpose(perm = var_1258_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 1, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_44, x = var_1258)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> out = reshape(shape = var_1262, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 1, 256]> var_1264 = silu(x = input_189)[name = tensor<string, []>("op_1264")];
+            tensor<fp32, [12, 1, 256]> input_191 = mul(x = var_1264, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 1, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 1, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_921, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1271 = const()[name = tensor<string, []>("op_1271"), val = tensor<int32, [4]>([1, 12, 1, 256])];
-            tensor<fp32, [1, 12, 1, 256]> var_1272 = reshape(shape = var_1271, x = xt_5)[name = tensor<string, []>("op_1272")];
-            tensor<int32, [4]> var_1273_perm_0 = const()[name = tensor<string, []>("op_1273_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1276 = const()[name = tensor<string, []>("op_1276"), val = tensor<int32, [3]>([1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> var_1273 = transpose(perm = var_1273_perm_0, x = var_1272)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [1, 12, 256]> query_5 = reshape(shape = var_1276, x = var_1273)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 1, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_36, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [4]>([1, 12, 1, 256])];
+            tensor<fp32, [1, 12, 1, 256]> var_1275 = reshape(shape = var_1274, x = xt_5)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [4]> var_1276_perm_0 = const()[name = tensor<string, []>("op_1276_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1279 = const()[name = tensor<string, []>("op_1279"), val = tensor<int32, [3]>([1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> var_1276 = transpose(perm = var_1276_perm_0, x = var_1275)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [1, 12, 256]> query_5 = reshape(shape = var_1279, x = var_1276)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 1, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 1, 768]> var_1299 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 1, 768]> var_1302 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 1, 3, 256])];
-            tensor<fp32, [12, 1, 3, 256]> var_1301 = reshape(shape = concat_2, x = var_1299)[name = tensor<string, []>("op_1301")];
-            tensor<int32, [1]> var_1302_axes_0 = const()[name = tensor<string, []>("op_1302_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 1, 3, 256]> var_1302 = expand_dims(axes = var_1302_axes_0, x = var_1301)[name = tensor<string, []>("op_1302")];
-            tensor<int32, [5]> var_1303_perm_0 = const()[name = tensor<string, []>("op_1303_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1304_axes_0 = const()[name = tensor<string, []>("op_1304_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 1, 1, 256]> var_1303 = transpose(perm = var_1303_perm_0, x = var_1302)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 1, 256]> var_1304 = squeeze(axes = var_1304_axes_0, x = var_1303)[name = tensor<string, []>("op_1304")];
+            tensor<fp32, [12, 1, 3, 256]> var_1304 = reshape(shape = concat_2, x = var_1302)[name = tensor<string, []>("op_1304")];
+            tensor<int32, [1]> var_1305_axes_0 = const()[name = tensor<string, []>("op_1305_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 1, 3, 256]> var_1305 = expand_dims(axes = var_1305_axes_0, x = var_1304)[name = tensor<string, []>("op_1305")];
+            tensor<int32, [5]> var_1306_perm_0 = const()[name = tensor<string, []>("op_1306_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1307_axes_0 = const()[name = tensor<string, []>("op_1307_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 1, 1, 256]> var_1306 = transpose(perm = var_1306_perm_0, x = var_1305)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 1, 256]> var_1307 = squeeze(axes = var_1307_axes_0, x = var_1306)[name = tensor<string, []>("op_1307")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 1, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 1, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 1, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 1, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 1, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1304)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1312 = const()[name = tensor<string, []>("op_1312"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1313 = reshape(shape = var_1312, x = q_19)[name = tensor<string, []>("op_1313")];
+            tensor<fp32, [12, 1, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1307)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1315 = const()[name = tensor<string, []>("op_1315"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1316 = reshape(shape = var_1315, x = q_19)[name = tensor<string, []>("op_1316")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1319 = const()[name = tensor<string, []>("op_1319"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1320 = reshape(shape = var_1319, x = k_19)[name = tensor<string, []>("op_1320")];
+            tensor<int32, [3]> var_1322 = const()[name = tensor<string, []>("op_1322"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1323 = reshape(shape = var_1322, x = k_19)[name = tensor<string, []>("op_1323")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [3]>([12, 4, 64])];
-            tensor<fp32, [12, 4, 64]> var_1327 = reshape(shape = var_1326, x = v_19)[name = tensor<string, []>("op_1327")];
+            tensor<int32, [3]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [3]>([12, 4, 64])];
+            tensor<fp32, [12, 4, 64]> var_1330 = reshape(shape = var_1329, x = v_19)[name = tensor<string, []>("op_1330")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1330 = const()[name = tensor<string, []>("op_1330"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1313)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [1, 4, 12, 64]> q = reshape(shape = var_1330, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1332 = const()[name = tensor<string, []>("op_1332"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1320)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [1, 4, 12, 64]> k = reshape(shape = var_1332, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1334 = const()[name = tensor<string, []>("op_1334"), val = tensor<int32, [4]>([1, 4, 12, 64])];
-            tensor<fp32, [4, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1327)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [1, 4, 12, 64]> v = reshape(shape = var_1334, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1316)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [1, 4, 12, 64]> q = reshape(shape = var_1333, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1323)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [1, 4, 12, 64]> k = reshape(shape = var_1335, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([1, 4, 12, 64])];
+            tensor<fp32, [4, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1330)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [1, 4, 12, 64]> v = reshape(shape = var_1337, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [1, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1139,34 +1138,34 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1342 = const()[name = tensor<string, []>("op_1342"), val = tensor<int32, [2]>([12, 256])];
-            tensor<fp32, [12, 1, 4, 64]> var_1338 = transpose(perm = var_1337, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1342, x = var_1338)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([12, 1, 256])];
-            tensor<fp32, [12, 1, 256]> attn_output = reshape(shape = var_1346, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1345 = const()[name = tensor<string, []>("op_1345"), val = tensor<int32, [2]>([12, 256])];
+            tensor<fp32, [12, 1, 4, 64]> var_1341 = transpose(perm = var_1340, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [12, 256]> attn_output_11 = reshape(shape = var_1345, x = var_1341)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [12, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1349 = const()[name = tensor<string, []>("op_1349"), val = tensor<int32, [3]>([12, 1, 256])];
+            tensor<fp32, [12, 1, 256]> attn_output = reshape(shape = var_1349, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [1, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [1, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_921, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [1, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [1, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [1, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [1, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_36, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [1, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [1, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [1, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [1, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_921, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 12, 256])];
-            tensor<fp32, [1, 1, 12, 256]> input = reshape(shape = var_1366, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1368 = const()[name = tensor<string, []>("op_1368"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 1, 12, 1]> var_1369 = reduce_l2_norm(axes = var_1368, keep_dims = var_924, x = input)[name = tensor<string, []>("op_1369")];
+            tensor<fp32, [1, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_36, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([1, 1, 12, 256])];
+            tensor<fp32, [1, 1, 12, 256]> input = reshape(shape = var_1369, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 1, 12, 1]> var_1372 = reduce_l2_norm(axes = var_1371, keep_dims = var_35, x = input)[name = tensor<string, []>("op_1372")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 1, 12, 1]> clip_5 = clip(alpha = var_916, beta = const_42, x = var_1369)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 1, 12, 256]> var_1371 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1371")];
+            tensor<fp32, [1, 1, 12, 1]> clip_5 = clip(alpha = var_49, beta = const_42, x = var_1372)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 1, 12, 256]> var_1374 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1374")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([1, 256, 12])];
-            tensor<fp32, [1, 1, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1371)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 1, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1374)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [1, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1175,10 +1174,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 1, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 1, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = matmul_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 1, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1375")];
-            tensor<int32, []> var_1377_axis_0 = const()[name = tensor<string, []>("op_1377_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1377_axis_0, values = (var_1073, nkv))[name = tensor<string, []>("op_1377")];
-            tensor<int32, []> var_1379_axis_0 = const()[name = tensor<string, []>("op_1379_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1379_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1379")];
+            tensor<fp32, [1, 1, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1378")];
+            tensor<int32, []> var_1380_axis_0 = const()[name = tensor<string, []>("op_1380_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1380_axis_0, values = (var_1076, nkv))[name = tensor<string, []>("op_1380")];
+            tensor<int32, []> var_1382_axis_0 = const()[name = tensor<string, []>("op_1382_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1382_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1382")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 172bd36e29739117a908278b3525a67e992d8b17..4d0f7dab493892e96c7f5365a99d744d609ae14c 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:866e7d0226752a637b1394b3ae9ecf1dc1de0ca1073f1b164eb38c8dfe62ea4a
-size 171366
+oid sha256:cedf01fcea7289c508f0174575bc7f7cb6a932676c48bf6d17e6e8761e0e7ce1
+size 175284
diff --git a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Manifest.json b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Manifest.json
index 5e96fabcd00f0d81e3d7405b594d3c65e54a58a8..597f60c85753c109df7183ebf5cc7eba6fc00946 100644
--- a/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Manifest.json
+++ b/optimized/dih3/100ms/ls_eend_dih3_100ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "80A6D93A-77B4-40EA-A15C-5545A78FEEBD": {
+        "887AF76B-5BF7-465B-AF63-A5E7FF000DC6": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         },
-        "9DDE2145-36B0-46F7-8DFC-385831DC4F71": {
+        "8D81431E-8F42-4587-93CD-FB6D145BD632": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "80A6D93A-77B4-40EA-A15C-5545A78FEEBD"
+    "rootModelIdentifier": "887AF76B-5BF7-465B-AF63-A5E7FF000DC6"
 }
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/analytics/coremldata.bin b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/analytics/coremldata.bin
index 5409bd7627c8339408f2ba44c89764b14943a6db..5275287c92c06d59c0f454d9396d645febb5490e 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a8893bce28227e8d51b3751b9be3a44e3ec395d16b2515ff8db197af2749801d
+oid sha256:5b551f8181ccd8ed3acd3ea62db76df66be4bc1acee718b152fedb8e2052cf1f
 size 243
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/coremldata.bin b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/coremldata.bin
index 1ab9910e40d04ee5bb87f2f64a0f79e96a63d69c..b604e797d0d7645d87b600080421e751416b3e47 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/coremldata.bin
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:714cac00c71f584a18c944c841c3236d25c7bded9052f77aa3c110e1c0373194
-size 1310
+oid sha256:a9a6da6dd9e7274ccc549595c38658e3f8a05098b3ff212278de6e0cac6f157e
+size 1413
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/metadata.json b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/metadata.json
index aaab8ad1bf6a40dc63cb5e669c82cca3eee7f301..0a646b8aab9afdfcede56f504c4d672650956b97 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/metadata.json
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=2, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=2, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 48,
+      "Ios17.sliceByIndex" : 50,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 14,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 2 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 25 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 2, 345]",
+        "shape" : "[1, 25, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 2, \"step_duration_ms\": 200, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 25}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/model.mil b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/model.mil
index 5651eaf71d9adaeffd8f5bfe34de4f9950314977..2ccb23e6a99a5f049025ab2b67fb75bd7b7827a2 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/model.mil
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlmodelc/model.mil
@@ -1,234 +1,248 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 2, 345]> features, tensor<fp32, [2]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [2, 2]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [2]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [2, 2]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 2, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 25, 23]> features, tensor<fp32, [2]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [2, 2]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [2]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [2]>([0x1p+0, 0x1p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [2, 2]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [2, 2]>([[0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, [3]>([1, 2, 345])];
+            tensor<fp32, [1, 2, 345]> input_1 = reshape(shape = var_36, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_45 = const()[name = tensor<string, []>("op_45"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_51 = const()[name = tensor<string, []>("op_51"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 2, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 2, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 2, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 2, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 2, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_46, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 2, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 2, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 2, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_183 = const()[name = tensor<string, []>("op_183"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_184 = mul(x = input_13, y = var_183)[name = tensor<string, []>("op_184")];
+            tensor<fp32, [1, 2, 256]> input_15 = add(x = var_184, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 2, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,153 +253,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 2, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 2, 256]> var_198 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_199 = const()[name = tensor<string, []>("op_199"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_200 = reshape(shape = var_199, x = var_198)[name = tensor<string, []>("op_200")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 2, 256]> var_204 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_205 = const()[name = tensor<string, []>("op_205"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_206 = mul(x = var_204, y = var_205)[name = tensor<string, []>("op_206")];
+            tensor<int32, [4]> var_207 = const()[name = tensor<string, []>("op_207"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_208 = reshape(shape = var_207, x = var_206)[name = tensor<string, []>("op_208")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 2, 256]> var_212 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_213 = const()[name = tensor<string, []>("op_213"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_214 = reshape(shape = var_213, x = var_212)[name = tensor<string, []>("op_214")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 2, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [2]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [2]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 2, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_208)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 2, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_200)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 2, 2]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [2, 2]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 2, 2]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_224 = const()[name = tensor<string, []>("op_224"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_225 = reshape(shape = var_224, x = sqrt_s_t_1)[name = tensor<string, []>("op_225")];
+            tensor<fp32, [2, 2]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_225)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 2, 2]> var_227 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_227")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [2]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 2, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_214)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 2, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_227, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_229_transpose_x_0 = const()[name = tensor<string, []>("op_229_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_229_transpose_y_0 = const()[name = tensor<string, []>("op_229_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_229 = matmul(transpose_x = var_229_transpose_x_0, transpose_y = var_229_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_229")];
+            tensor<fp32, [2]> var_230 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_230")];
+            tensor<int32, [4]> var_231 = const()[name = tensor<string, []>("op_231"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_232 = reshape(shape = var_231, x = var_230)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 4, 2, 64]> cross_1 = mul(x = var_229, y = var_232)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 2, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_235 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_235")];
+            tensor<bool, []> var_237_transpose_x_1 = const()[name = tensor<string, []>("op_237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_237_transpose_y_1 = const()[name = tensor<string, []>("op_237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_237 = matmul(transpose_x = var_237_transpose_x_1, transpose_y = var_237_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_237")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_235, y = var_237)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_239 = const()[name = tensor<string, []>("op_239"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_239)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_241 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 4, 64, 64]> var_242 = real_div(x = new_kv_unnorm_1, y = var_241)[name = tensor<string, []>("op_242")];
+            tensor<int32, [4]> var_243_perm_0 = const()[name = tensor<string, []>("op_243_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 2, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 2, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 2, 4, 64]> var_243 = transpose(perm = var_243_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 2, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_54, x = var_243)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_247 = const()[name = tensor<string, []>("op_247"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_5 = reshape(shape = var_247, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 2, 256]> var_249 = silu(x = input_19)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 2, 256]> input_21 = mul(x = var_249, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 2, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 2, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = x_3)[name = tensor<string, []>("op_257")];
+            tensor<int32, [3]> var_260_begin_0 = const()[name = tensor<string, []>("op_260_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_260_end_0 = const()[name = tensor<string, []>("op_260_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_260_end_mask_0 = const()[name = tensor<string, []>("op_260_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_260 = slice_by_index(begin = var_260_begin_0, end = var_260_end_0, end_mask = var_260_end_mask_0, x = window_1)[name = tensor<string, []>("op_260")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_62, interleave = window_3_interleave_0, values = (var_260, var_257))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_265_begin_0 = const()[name = tensor<string, []>("op_265_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_265_end_0 = const()[name = tensor<string, []>("op_265_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_265_end_mask_0 = const()[name = tensor<string, []>("op_265_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_265 = slice_by_index(begin = var_265_begin_0, end = var_265_end_0, end_mask = var_265_end_mask_0, x = x_3)[name = tensor<string, []>("op_265")];
+            tensor<int32, [3]> var_268_begin_0 = const()[name = tensor<string, []>("op_268_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_268_end_0 = const()[name = tensor<string, []>("op_268_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_268_end_mask_0 = const()[name = tensor<string, []>("op_268_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_268 = slice_by_index(begin = var_268_begin_0, end = var_268_end_0, end_mask = var_268_end_mask_0, x = window_3)[name = tensor<string, []>("op_268")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_62, interleave = window_5_interleave_0, values = (var_268, var_265))[name = tensor<string, []>("window_5")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_23 = concat(axis = var_49, interleave = input_23_interleave_0, values = (window_3, window_5))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_257_split_sizes_0 = const()[name = tensor<string, []>("op_257_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_257_axis_0 = const()[name = tensor<string, []>("op_257_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_257_0, tensor<fp32, [2, 256, 16]> var_257_1 = split(axis = var_257_axis_0, split_sizes = var_257_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_257")];
-            tensor<fp32, [2, 256, 16]> var_259 = sigmoid(x = var_257_1)[name = tensor<string, []>("op_259")];
-            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_257_0, y = var_259)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [2, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [2, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_293_split_sizes_0 = const()[name = tensor<string, []>("op_293_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_293_axis_0 = const()[name = tensor<string, []>("op_293_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_293_0, tensor<fp32, [2, 256, 16]> var_293_1 = split(axis = var_293_axis_0, split_sizes = var_293_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_293")];
+            tensor<fp32, [2, 256, 16]> var_295 = sigmoid(x = var_293_1)[name = tensor<string, []>("op_295")];
+            tensor<fp32, [2, 256, 16]> inputs_5 = mul(x = var_293_0, y = var_295)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [2, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [2, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [2, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [2, 1, 256]> var_290 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_290")];
-            tensor<int32, [3]> var_292_perm_0 = const()[name = tensor<string, []>("op_292_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_292 = transpose(perm = var_292_perm_0, x = var_290)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 2, 256]> input_31 = add(x = x_3, y = var_292)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 2, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 2, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_315 = const()[name = tensor<string, []>("op_315"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_316 = mul(x = input_39, y = var_315)[name = tensor<string, []>("op_316")];
-            tensor<fp32, [1, 2, 256]> input_41 = add(x = var_316, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_326_begin_0 = const()[name = tensor<string, []>("op_326_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_326_end_0 = const()[name = tensor<string, []>("op_326_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_326_end_mask_0 = const()[name = tensor<string, []>("op_326_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [2, 1, 256]> var_326 = slice_by_index(begin = var_326_begin_0, end = var_326_end_0, end_mask = var_326_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_326")];
+            tensor<int32, [3]> var_328_perm_0 = const()[name = tensor<string, []>("op_328_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_328 = transpose(perm = var_328_perm_0, x = var_326)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 2, 256]> input_33 = add(x = x_3, y = var_328)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 2, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 2, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 2, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_351 = const()[name = tensor<string, []>("op_351"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_352 = mul(x = input_41, y = var_351)[name = tensor<string, []>("op_352")];
+            tensor<fp32, [1, 2, 256]> input_43 = add(x = var_352, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 2, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 2, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_345 = const()[name = tensor<string, []>("op_345"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_346 = mul(x = input_51, y = var_345)[name = tensor<string, []>("op_346")];
-            tensor<fp32, [1, 2, 256]> input_53 = add(x = var_346, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 2, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 2, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 2, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 2, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_381 = const()[name = tensor<string, []>("op_381"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_382 = mul(x = input_53, y = var_381)[name = tensor<string, []>("op_382")];
+            tensor<fp32, [1, 2, 256]> input_55 = add(x = var_382, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 2, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -396,153 +410,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 2, 256]> var_360 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_362 = reshape(shape = var_361, x = var_360)[name = tensor<string, []>("op_362")];
+            tensor<fp32, [1, 2, 256]> var_396 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_397 = const()[name = tensor<string, []>("op_397"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_398 = reshape(shape = var_397, x = var_396)[name = tensor<string, []>("op_398")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_366 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_367 = const()[name = tensor<string, []>("op_367"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_368 = mul(x = var_366, y = var_367)[name = tensor<string, []>("op_368")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 2, 256]> var_402 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_403 = const()[name = tensor<string, []>("op_403"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_404 = mul(x = var_402, y = var_403)[name = tensor<string, []>("op_404")];
+            tensor<int32, [4]> var_405 = const()[name = tensor<string, []>("op_405"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_406 = reshape(shape = var_405, x = var_404)[name = tensor<string, []>("op_406")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_374 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_376 = reshape(shape = var_375, x = var_374)[name = tensor<string, []>("op_376")];
+            tensor<fp32, [1, 2, 256]> var_410 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_412 = reshape(shape = var_411, x = var_410)[name = tensor<string, []>("op_412")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 2, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [2]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [2]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_362)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 2, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_406)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 2, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_398)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 2, 2]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_387 = reshape(shape = var_386, x = sqrt_s_t_3)[name = tensor<string, []>("op_387")];
-            tensor<fp32, [2, 2]> M_3 = real_div(x = encoder__causal_mask, y = var_387)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 2, 2]> var_389 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_389")];
+            tensor<int32, [2]> var_422 = const()[name = tensor<string, []>("op_422"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_423 = reshape(shape = var_422, x = sqrt_s_t_3)[name = tensor<string, []>("op_423")];
+            tensor<fp32, [2, 2]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_423)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 2, 2]> var_425 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_425")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_376)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_389, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_391_transpose_x_0 = const()[name = tensor<string, []>("op_391_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_391_transpose_y_0 = const()[name = tensor<string, []>("op_391_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_391 = matmul(transpose_x = var_391_transpose_x_0, transpose_y = var_391_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_391")];
-            tensor<fp32, [2]> var_392 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_392")];
-            tensor<int32, [4]> var_393 = const()[name = tensor<string, []>("op_393"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_394 = reshape(shape = var_393, x = var_392)[name = tensor<string, []>("op_394")];
-            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_391, y = var_394)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 2, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_412)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 2, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_425, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_427_transpose_x_0 = const()[name = tensor<string, []>("op_427_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_427_transpose_y_0 = const()[name = tensor<string, []>("op_427_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_427 = matmul(transpose_x = var_427_transpose_x_0, transpose_y = var_427_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_427")];
+            tensor<fp32, [2]> var_428 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_428")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 4, 2, 64]> cross_3 = mul(x = var_427, y = var_430)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 2, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_397 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_397")];
-            tensor<bool, []> var_399_transpose_x_1 = const()[name = tensor<string, []>("op_399_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_399_transpose_y_1 = const()[name = tensor<string, []>("op_399_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_1, transpose_y = var_399_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_397, y = var_399)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_401)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_403 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [1, 4, 64, 64]> var_404 = real_div(x = new_kv_unnorm_3, y = var_403)[name = tensor<string, []>("op_404")];
-            tensor<int32, [4]> var_405_perm_0 = const()[name = tensor<string, []>("op_405_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_433 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_433")];
+            tensor<bool, []> var_435_transpose_x_1 = const()[name = tensor<string, []>("op_435_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_435_transpose_y_1 = const()[name = tensor<string, []>("op_435_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_435 = matmul(transpose_x = var_435_transpose_x_1, transpose_y = var_435_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_435")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_433, y = var_435)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_437 = const()[name = tensor<string, []>("op_437"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_437)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_439 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_439")];
+            tensor<fp32, [1, 4, 64, 64]> var_440 = real_div(x = new_kv_unnorm_3, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441_perm_0 = const()[name = tensor<string, []>("op_441_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_405 = transpose(perm = var_405_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_405)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_409, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 2, 256]> var_411 = silu(x = input_57)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 2, 256]> input_59 = mul(x = var_411, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 2, 4, 64]> var_441 = transpose(perm = var_441_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 2, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_54, x = var_441)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_445 = const()[name = tensor<string, []>("op_445"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_11 = reshape(shape = var_445, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 2, 256]> var_447 = silu(x = input_59)[name = tensor<string, []>("op_447")];
+            tensor<fp32, [1, 2, 256]> input_61 = mul(x = var_447, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 2, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 2, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_7_begin_0 = const()[name = tensor<string, []>("window_7_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_7_end_0 = const()[name = tensor<string, []>("window_7_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_7_end_mask_0 = const()[name = tensor<string, []>("window_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_7_squeeze_mask_0 = const()[name = tensor<string, []>("window_7_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_7 = slice_by_index(begin = window_7_begin_0, end = window_7_end_0, end_mask = window_7_end_mask_0, squeeze_mask = window_7_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_419 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = x_9)[name = tensor<string, []>("op_419")];
-            tensor<int32, [3]> var_422_begin_0 = const()[name = tensor<string, []>("op_422_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_422_end_0 = const()[name = tensor<string, []>("op_422_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_422_end_mask_0 = const()[name = tensor<string, []>("op_422_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_422 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = window_7)[name = tensor<string, []>("op_422")];
+            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = x_9)[name = tensor<string, []>("op_455")];
+            tensor<int32, [3]> var_458_begin_0 = const()[name = tensor<string, []>("op_458_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_458_end_0 = const()[name = tensor<string, []>("op_458_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_458_end_mask_0 = const()[name = tensor<string, []>("op_458_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_458 = slice_by_index(begin = var_458_begin_0, end = var_458_end_0, end_mask = var_458_end_mask_0, x = window_7)[name = tensor<string, []>("op_458")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_422, var_419))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_62, interleave = window_9_interleave_0, values = (var_458, var_455))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = x_9)[name = tensor<string, []>("op_463")];
+            tensor<int32, [3]> var_466_begin_0 = const()[name = tensor<string, []>("op_466_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_466_end_0 = const()[name = tensor<string, []>("op_466_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_466_end_mask_0 = const()[name = tensor<string, []>("op_466_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_466 = slice_by_index(begin = var_466_begin_0, end = var_466_end_0, end_mask = var_466_end_mask_0, x = window_9)[name = tensor<string, []>("op_466")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_62, interleave = window_11_interleave_0, values = (var_466, var_463))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_63 = concat(axis = var_49, interleave = input_63_interleave_0, values = (window_9, window_11))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_455_split_sizes_0 = const()[name = tensor<string, []>("op_455_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_455_axis_0 = const()[name = tensor<string, []>("op_455_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_455_0, tensor<fp32, [2, 256, 16]> var_455_1 = split(axis = var_455_axis_0, split_sizes = var_455_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_455")];
-            tensor<fp32, [2, 256, 16]> var_457 = sigmoid(x = var_455_1)[name = tensor<string, []>("op_457")];
-            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_455_0, y = var_457)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [2, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [2, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_491_split_sizes_0 = const()[name = tensor<string, []>("op_491_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_491_axis_0 = const()[name = tensor<string, []>("op_491_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_491_0, tensor<fp32, [2, 256, 16]> var_491_1 = split(axis = var_491_axis_0, split_sizes = var_491_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_491")];
+            tensor<fp32, [2, 256, 16]> var_493 = sigmoid(x = var_491_1)[name = tensor<string, []>("op_493")];
+            tensor<fp32, [2, 256, 16]> inputs_15 = mul(x = var_491_0, y = var_493)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [2, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [2, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [2, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_488_begin_0 = const()[name = tensor<string, []>("op_488_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_488_end_0 = const()[name = tensor<string, []>("op_488_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_488_end_mask_0 = const()[name = tensor<string, []>("op_488_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [2, 1, 256]> var_488 = slice_by_index(begin = var_488_begin_0, end = var_488_end_0, end_mask = var_488_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_488")];
-            tensor<int32, [3]> var_490_perm_0 = const()[name = tensor<string, []>("op_490_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_490 = transpose(perm = var_490_perm_0, x = var_488)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 2, 256]> input_71 = add(x = x_9, y = var_490)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 2, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 2, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_513 = const()[name = tensor<string, []>("op_513"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_514 = mul(x = input_79, y = var_513)[name = tensor<string, []>("op_514")];
-            tensor<fp32, [1, 2, 256]> input_81 = add(x = var_514, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_524_begin_0 = const()[name = tensor<string, []>("op_524_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_524_end_0 = const()[name = tensor<string, []>("op_524_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_524_end_mask_0 = const()[name = tensor<string, []>("op_524_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [2, 1, 256]> var_524 = slice_by_index(begin = var_524_begin_0, end = var_524_end_0, end_mask = var_524_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_524")];
+            tensor<int32, [3]> var_526_perm_0 = const()[name = tensor<string, []>("op_526_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_526 = transpose(perm = var_526_perm_0, x = var_524)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 2, 256]> input_73 = add(x = x_9, y = var_526)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 2, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 2, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 2, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_549 = const()[name = tensor<string, []>("op_549"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_550 = mul(x = input_81, y = var_549)[name = tensor<string, []>("op_550")];
+            tensor<fp32, [1, 2, 256]> input_83 = add(x = var_550, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 2, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 2, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_543 = const()[name = tensor<string, []>("op_543"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_544 = mul(x = input_91, y = var_543)[name = tensor<string, []>("op_544")];
-            tensor<fp32, [1, 2, 256]> input_93 = add(x = var_544, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 2, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 2, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 2, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 2, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_579 = const()[name = tensor<string, []>("op_579"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_580 = mul(x = input_93, y = var_579)[name = tensor<string, []>("op_580")];
+            tensor<fp32, [1, 2, 256]> input_95 = add(x = var_580, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 2, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -553,153 +567,153 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 2, 256]> var_558 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_560 = reshape(shape = var_559, x = var_558)[name = tensor<string, []>("op_560")];
+            tensor<fp32, [1, 2, 256]> var_594 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_595 = const()[name = tensor<string, []>("op_595"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_596 = reshape(shape = var_595, x = var_594)[name = tensor<string, []>("op_596")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_564 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_565 = const()[name = tensor<string, []>("op_565"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_566 = mul(x = var_564, y = var_565)[name = tensor<string, []>("op_566")];
-            tensor<int32, [4]> var_567 = const()[name = tensor<string, []>("op_567"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_568 = reshape(shape = var_567, x = var_566)[name = tensor<string, []>("op_568")];
+            tensor<fp32, [1, 2, 256]> var_600 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_602 = mul(x = var_600, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<int32, [4]> var_603 = const()[name = tensor<string, []>("op_603"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_604 = reshape(shape = var_603, x = var_602)[name = tensor<string, []>("op_604")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_572 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_573 = const()[name = tensor<string, []>("op_573"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_574 = reshape(shape = var_573, x = var_572)[name = tensor<string, []>("op_574")];
+            tensor<fp32, [1, 2, 256]> var_608 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_609 = const()[name = tensor<string, []>("op_609"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_610 = reshape(shape = var_609, x = var_608)[name = tensor<string, []>("op_610")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 2, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [2]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [2]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_568)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_560)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 2, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_604)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 2, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_596)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 2, 2]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_584 = const()[name = tensor<string, []>("op_584"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_585 = reshape(shape = var_584, x = sqrt_s_t_5)[name = tensor<string, []>("op_585")];
-            tensor<fp32, [2, 2]> M_5 = real_div(x = encoder__causal_mask, y = var_585)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 2, 2]> var_587 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_587")];
+            tensor<int32, [2]> var_620 = const()[name = tensor<string, []>("op_620"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_621 = reshape(shape = var_620, x = sqrt_s_t_5)[name = tensor<string, []>("op_621")];
+            tensor<fp32, [2, 2]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_621)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 2, 2]> var_623 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_623")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_574)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_587, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_589_transpose_x_0 = const()[name = tensor<string, []>("op_589_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_589_transpose_y_0 = const()[name = tensor<string, []>("op_589_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_589 = matmul(transpose_x = var_589_transpose_x_0, transpose_y = var_589_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_589")];
-            tensor<fp32, [2]> var_590 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_590")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
-            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_589, y = var_592)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 2, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_610)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 2, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_623, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_625_transpose_x_0 = const()[name = tensor<string, []>("op_625_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_625_transpose_y_0 = const()[name = tensor<string, []>("op_625_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_625 = matmul(transpose_x = var_625_transpose_x_0, transpose_y = var_625_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_625")];
+            tensor<fp32, [2]> var_626 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_626")];
+            tensor<int32, [4]> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_628 = reshape(shape = var_627, x = var_626)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 4, 2, 64]> cross_5 = mul(x = var_625, y = var_628)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 2, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_595 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_595")];
-            tensor<bool, []> var_597_transpose_x_1 = const()[name = tensor<string, []>("op_597_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_597_transpose_y_1 = const()[name = tensor<string, []>("op_597_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_597 = matmul(transpose_x = var_597_transpose_x_1, transpose_y = var_597_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_597")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_595, y = var_597)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_599)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_601 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [1, 4, 64, 64]> var_602 = real_div(x = new_kv_unnorm_5, y = var_601)[name = tensor<string, []>("op_602")];
-            tensor<int32, [4]> var_603_perm_0 = const()[name = tensor<string, []>("op_603_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_631 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_631")];
+            tensor<bool, []> var_633_transpose_x_1 = const()[name = tensor<string, []>("op_633_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_633_transpose_y_1 = const()[name = tensor<string, []>("op_633_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_633 = matmul(transpose_x = var_633_transpose_x_1, transpose_y = var_633_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_633")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_631, y = var_633)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_635)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_637 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_637")];
+            tensor<fp32, [1, 4, 64, 64]> var_638 = real_div(x = new_kv_unnorm_5, y = var_637)[name = tensor<string, []>("op_638")];
+            tensor<int32, [4]> var_639_perm_0 = const()[name = tensor<string, []>("op_639_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_603 = transpose(perm = var_603_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_603)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_607, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 2, 256]> var_609 = silu(x = input_97)[name = tensor<string, []>("op_609")];
-            tensor<fp32, [1, 2, 256]> input_99 = mul(x = var_609, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 2, 4, 64]> var_639 = transpose(perm = var_639_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 2, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_54, x = var_639)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_643 = const()[name = tensor<string, []>("op_643"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_17 = reshape(shape = var_643, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 2, 256]> var_645 = silu(x = input_99)[name = tensor<string, []>("op_645")];
+            tensor<fp32, [1, 2, 256]> input_101 = mul(x = var_645, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 2, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 2, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_617_begin_0 = const()[name = tensor<string, []>("op_617_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_617_end_0 = const()[name = tensor<string, []>("op_617_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_617_end_mask_0 = const()[name = tensor<string, []>("op_617_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_617 = slice_by_index(begin = var_617_begin_0, end = var_617_end_0, end_mask = var_617_end_mask_0, x = x_15)[name = tensor<string, []>("op_617")];
-            tensor<int32, [3]> var_620_begin_0 = const()[name = tensor<string, []>("op_620_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_620_end_0 = const()[name = tensor<string, []>("op_620_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_620_end_mask_0 = const()[name = tensor<string, []>("op_620_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_620 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = window_13)[name = tensor<string, []>("op_620")];
+            tensor<int32, [3]> var_653_begin_0 = const()[name = tensor<string, []>("op_653_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_653_end_0 = const()[name = tensor<string, []>("op_653_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_653_end_mask_0 = const()[name = tensor<string, []>("op_653_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_653 = slice_by_index(begin = var_653_begin_0, end = var_653_end_0, end_mask = var_653_end_mask_0, x = x_15)[name = tensor<string, []>("op_653")];
+            tensor<int32, [3]> var_656_begin_0 = const()[name = tensor<string, []>("op_656_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_656_end_0 = const()[name = tensor<string, []>("op_656_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_656_end_mask_0 = const()[name = tensor<string, []>("op_656_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_656 = slice_by_index(begin = var_656_begin_0, end = var_656_end_0, end_mask = var_656_end_mask_0, x = window_13)[name = tensor<string, []>("op_656")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_620, var_617))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_625_begin_0 = const()[name = tensor<string, []>("op_625_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_625_end_0 = const()[name = tensor<string, []>("op_625_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_625_end_mask_0 = const()[name = tensor<string, []>("op_625_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_625 = slice_by_index(begin = var_625_begin_0, end = var_625_end_0, end_mask = var_625_end_mask_0, x = x_15)[name = tensor<string, []>("op_625")];
-            tensor<int32, [3]> var_628_begin_0 = const()[name = tensor<string, []>("op_628_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_628_end_0 = const()[name = tensor<string, []>("op_628_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_628_end_mask_0 = const()[name = tensor<string, []>("op_628_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_628 = slice_by_index(begin = var_628_begin_0, end = var_628_end_0, end_mask = var_628_end_mask_0, x = window_15)[name = tensor<string, []>("op_628")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_62, interleave = window_15_interleave_0, values = (var_656, var_653))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_661_begin_0 = const()[name = tensor<string, []>("op_661_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_661_end_0 = const()[name = tensor<string, []>("op_661_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_661_end_mask_0 = const()[name = tensor<string, []>("op_661_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_661 = slice_by_index(begin = var_661_begin_0, end = var_661_end_0, end_mask = var_661_end_mask_0, x = x_15)[name = tensor<string, []>("op_661")];
+            tensor<int32, [3]> var_664_begin_0 = const()[name = tensor<string, []>("op_664_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_664_end_0 = const()[name = tensor<string, []>("op_664_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_664_end_mask_0 = const()[name = tensor<string, []>("op_664_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_664 = slice_by_index(begin = var_664_begin_0, end = var_664_end_0, end_mask = var_664_end_mask_0, x = window_15)[name = tensor<string, []>("op_664")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_628, var_625))[name = tensor<string, []>("window_17")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_62, interleave = window_17_interleave_0, values = (var_664, var_661))[name = tensor<string, []>("window_17")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_103 = concat(axis = var_49, interleave = input_103_interleave_0, values = (window_15, window_17))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_653_split_sizes_0 = const()[name = tensor<string, []>("op_653_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_653_axis_0 = const()[name = tensor<string, []>("op_653_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_653_0, tensor<fp32, [2, 256, 16]> var_653_1 = split(axis = var_653_axis_0, split_sizes = var_653_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_653")];
-            tensor<fp32, [2, 256, 16]> var_655 = sigmoid(x = var_653_1)[name = tensor<string, []>("op_655")];
-            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_653_0, y = var_655)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [2, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [2, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_689_split_sizes_0 = const()[name = tensor<string, []>("op_689_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_689_axis_0 = const()[name = tensor<string, []>("op_689_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_689_0, tensor<fp32, [2, 256, 16]> var_689_1 = split(axis = var_689_axis_0, split_sizes = var_689_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [2, 256, 16]> var_691 = sigmoid(x = var_689_1)[name = tensor<string, []>("op_691")];
+            tensor<fp32, [2, 256, 16]> inputs_25 = mul(x = var_689_0, y = var_691)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [2, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [2, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [2, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [2, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_686_begin_0 = const()[name = tensor<string, []>("op_686_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_686_end_0 = const()[name = tensor<string, []>("op_686_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_686_end_mask_0 = const()[name = tensor<string, []>("op_686_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [2, 1, 256]> var_686 = slice_by_index(begin = var_686_begin_0, end = var_686_end_0, end_mask = var_686_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_686")];
-            tensor<int32, [3]> var_688_perm_0 = const()[name = tensor<string, []>("op_688_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_688 = transpose(perm = var_688_perm_0, x = var_686)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 2, 256]> input_111 = add(x = x_15, y = var_688)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 2, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 2, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_711 = const()[name = tensor<string, []>("op_711"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_712 = mul(x = input_119, y = var_711)[name = tensor<string, []>("op_712")];
-            tensor<fp32, [1, 2, 256]> input_121 = add(x = var_712, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_722_begin_0 = const()[name = tensor<string, []>("op_722_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_722_end_0 = const()[name = tensor<string, []>("op_722_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_722_end_mask_0 = const()[name = tensor<string, []>("op_722_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [2, 1, 256]> var_722 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_722")];
+            tensor<int32, [3]> var_724_perm_0 = const()[name = tensor<string, []>("op_724_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_724 = transpose(perm = var_724_perm_0, x = var_722)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 2, 256]> input_113 = add(x = x_15, y = var_724)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 2, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 2, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 2, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_747 = const()[name = tensor<string, []>("op_747"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_748 = mul(x = input_121, y = var_747)[name = tensor<string, []>("op_748")];
+            tensor<fp32, [1, 2, 256]> input_123 = add(x = var_748, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 2, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 2, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_741 = const()[name = tensor<string, []>("op_741"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_742 = mul(x = input_131, y = var_741)[name = tensor<string, []>("op_742")];
-            tensor<fp32, [1, 2, 256]> input_133 = add(x = var_742, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 2, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 2, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 2, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 2, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_777 = const()[name = tensor<string, []>("op_777"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_778 = mul(x = input_133, y = var_777)[name = tensor<string, []>("op_778")];
+            tensor<fp32, [1, 2, 256]> input_135 = add(x = var_778, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 2, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_46, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -710,189 +724,182 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 2, 256]> var_756 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_757 = const()[name = tensor<string, []>("op_757"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_758 = reshape(shape = var_757, x = var_756)[name = tensor<string, []>("op_758")];
+            tensor<fp32, [1, 2, 256]> var_792 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_793 = const()[name = tensor<string, []>("op_793"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_794 = reshape(shape = var_793, x = var_792)[name = tensor<string, []>("op_794")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_762 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_763 = const()[name = tensor<string, []>("op_763"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 2, 256]> var_764 = mul(x = var_762, y = var_763)[name = tensor<string, []>("op_764")];
-            tensor<int32, [4]> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_766 = reshape(shape = var_765, x = var_764)[name = tensor<string, []>("op_766")];
+            tensor<fp32, [1, 2, 256]> var_798 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_799 = const()[name = tensor<string, []>("op_799"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 2, 256]> var_800 = mul(x = var_798, y = var_799)[name = tensor<string, []>("op_800")];
+            tensor<int32, [4]> var_801 = const()[name = tensor<string, []>("op_801"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_802 = reshape(shape = var_801, x = var_800)[name = tensor<string, []>("op_802")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> var_770 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_771 = const()[name = tensor<string, []>("op_771"), val = tensor<int32, [4]>([1, 2, 4, 64])];
-            tensor<fp32, [1, 2, 4, 64]> var_772 = reshape(shape = var_771, x = var_770)[name = tensor<string, []>("op_772")];
+            tensor<fp32, [1, 2, 256]> var_806 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_807 = const()[name = tensor<string, []>("op_807"), val = tensor<int32, [4]>([1, 2, 4, 64])];
+            tensor<fp32, [1, 2, 4, 64]> var_808 = reshape(shape = var_807, x = var_806)[name = tensor<string, []>("op_808")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 2, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 2, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [2]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [2]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_766)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_758)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 2, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_802)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 2, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_794)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 2, 2]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_783 = reshape(shape = var_782, x = sqrt_s_t_7)[name = tensor<string, []>("op_783")];
-            tensor<fp32, [2, 2]> M_7 = real_div(x = encoder__causal_mask, y = var_783)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 2, 2]> var_785 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_785")];
+            tensor<int32, [2]> var_818 = const()[name = tensor<string, []>("op_818"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_819 = reshape(shape = var_818, x = sqrt_s_t_7)[name = tensor<string, []>("op_819")];
+            tensor<fp32, [2, 2]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_819)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 2, 2]> var_821 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_821")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_772)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_785, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_787_transpose_x_0 = const()[name = tensor<string, []>("op_787_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_787_transpose_y_0 = const()[name = tensor<string, []>("op_787_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 2, 64]> var_787 = matmul(transpose_x = var_787_transpose_x_0, transpose_y = var_787_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_787")];
-            tensor<fp32, [2]> var_788 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_787, y = var_790)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 2, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_808)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 2, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_821, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_823_transpose_x_0 = const()[name = tensor<string, []>("op_823_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_823_transpose_y_0 = const()[name = tensor<string, []>("op_823_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 2, 64]> var_823 = matmul(transpose_x = var_823_transpose_x_0, transpose_y = var_823_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_823")];
+            tensor<fp32, [2]> var_824 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_824")];
+            tensor<int32, [4]> var_825 = const()[name = tensor<string, []>("op_825"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_826 = reshape(shape = var_825, x = var_824)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 4, 2, 64]> cross_7 = mul(x = var_823, y = var_826)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 2, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_793 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_793")];
-            tensor<bool, []> var_795_transpose_x_1 = const()[name = tensor<string, []>("op_795_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_795_transpose_y_1 = const()[name = tensor<string, []>("op_795_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_795 = matmul(transpose_x = var_795_transpose_x_1, transpose_y = var_795_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_795")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_793, y = var_795)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_797 = const()[name = tensor<string, []>("op_797"), val = tensor<fp32, []>(0x1p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_797)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_799 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_799")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_799)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_801_perm_0 = const()[name = tensor<string, []>("op_801_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_829 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_829")];
+            tensor<bool, []> var_831_transpose_x_1 = const()[name = tensor<string, []>("op_831_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_831_transpose_y_1 = const()[name = tensor<string, []>("op_831_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_831 = matmul(transpose_x = var_831_transpose_x_1, transpose_y = var_831_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_829, y = var_831)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_833)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_835 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_835")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_835)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_837_perm_0 = const()[name = tensor<string, []>("op_837_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 4, 64]> var_801 = transpose(perm = var_801_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_801)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_805, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 2, 256]> var_807 = silu(x = input_137)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [1, 2, 256]> input_139 = mul(x = var_807, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 2, 4, 64]> var_837 = transpose(perm = var_837_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 2, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_54, x = var_837)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp32, [1, 2, 256]> out_23 = reshape(shape = var_841, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 2, 256]> var_843 = silu(x = input_139)[name = tensor<string, []>("op_843")];
+            tensor<fp32, [1, 2, 256]> input_141 = mul(x = var_843, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 2, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 2, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_19_begin_0 = const()[name = tensor<string, []>("window_19_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_19_end_0 = const()[name = tensor<string, []>("window_19_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_19_end_mask_0 = const()[name = tensor<string, []>("window_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_19_squeeze_mask_0 = const()[name = tensor<string, []>("window_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_19 = slice_by_index(begin = window_19_begin_0, end = window_19_end_0, end_mask = window_19_end_mask_0, squeeze_mask = window_19_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_815_begin_0 = const()[name = tensor<string, []>("op_815_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_815_end_0 = const()[name = tensor<string, []>("op_815_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_815_end_mask_0 = const()[name = tensor<string, []>("op_815_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_815 = slice_by_index(begin = var_815_begin_0, end = var_815_end_0, end_mask = var_815_end_mask_0, x = x_21)[name = tensor<string, []>("op_815")];
-            tensor<int32, [3]> var_818_begin_0 = const()[name = tensor<string, []>("op_818_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_818_end_0 = const()[name = tensor<string, []>("op_818_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_818_end_mask_0 = const()[name = tensor<string, []>("op_818_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_818 = slice_by_index(begin = var_818_begin_0, end = var_818_end_0, end_mask = var_818_end_mask_0, x = window_19)[name = tensor<string, []>("op_818")];
+            tensor<int32, [3]> var_851_begin_0 = const()[name = tensor<string, []>("op_851_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_851_end_0 = const()[name = tensor<string, []>("op_851_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_851_end_mask_0 = const()[name = tensor<string, []>("op_851_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_851 = slice_by_index(begin = var_851_begin_0, end = var_851_end_0, end_mask = var_851_end_mask_0, x = x_21)[name = tensor<string, []>("op_851")];
+            tensor<int32, [3]> var_854_begin_0 = const()[name = tensor<string, []>("op_854_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_854_end_0 = const()[name = tensor<string, []>("op_854_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_854_end_mask_0 = const()[name = tensor<string, []>("op_854_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_854 = slice_by_index(begin = var_854_begin_0, end = var_854_end_0, end_mask = var_854_end_mask_0, x = window_19)[name = tensor<string, []>("op_854")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_818, var_815))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_823_begin_0 = const()[name = tensor<string, []>("op_823_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_823_end_0 = const()[name = tensor<string, []>("op_823_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_823_end_mask_0 = const()[name = tensor<string, []>("op_823_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_823 = slice_by_index(begin = var_823_begin_0, end = var_823_end_0, end_mask = var_823_end_mask_0, x = x_21)[name = tensor<string, []>("op_823")];
-            tensor<int32, [3]> var_826_begin_0 = const()[name = tensor<string, []>("op_826_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_826_end_0 = const()[name = tensor<string, []>("op_826_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_826_end_mask_0 = const()[name = tensor<string, []>("op_826_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_826 = slice_by_index(begin = var_826_begin_0, end = var_826_end_0, end_mask = var_826_end_mask_0, x = window_21)[name = tensor<string, []>("op_826")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_62, interleave = window_21_interleave_0, values = (var_854, var_851))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_859_begin_0 = const()[name = tensor<string, []>("op_859_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_859_end_0 = const()[name = tensor<string, []>("op_859_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_859_end_mask_0 = const()[name = tensor<string, []>("op_859_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_859 = slice_by_index(begin = var_859_begin_0, end = var_859_end_0, end_mask = var_859_end_mask_0, x = x_21)[name = tensor<string, []>("op_859")];
+            tensor<int32, [3]> var_862_begin_0 = const()[name = tensor<string, []>("op_862_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_862_end_0 = const()[name = tensor<string, []>("op_862_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_862_end_mask_0 = const()[name = tensor<string, []>("op_862_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_862 = slice_by_index(begin = var_862_begin_0, end = var_862_end_0, end_mask = var_862_end_mask_0, x = window_21)[name = tensor<string, []>("op_862")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_826, var_823))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_62, interleave = window_interleave_0, values = (var_862, var_859))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [2, 16, 256]> input_143 = concat(axis = var_49, interleave = input_143_interleave_0, values = (window_21, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [2, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_851_split_sizes_0 = const()[name = tensor<string, []>("op_851_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_851_axis_0 = const()[name = tensor<string, []>("op_851_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> var_851_0, tensor<fp32, [2, 256, 16]> var_851_1 = split(axis = var_851_axis_0, split_sizes = var_851_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_851")];
-            tensor<fp32, [2, 256, 16]> var_853 = sigmoid(x = var_851_1)[name = tensor<string, []>("op_853")];
-            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_851_0, y = var_853)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [2, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [2, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_887_split_sizes_0 = const()[name = tensor<string, []>("op_887_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_887_axis_0 = const()[name = tensor<string, []>("op_887_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> var_887_0, tensor<fp32, [2, 256, 16]> var_887_1 = split(axis = var_887_axis_0, split_sizes = var_887_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [2, 256, 16]> var_889 = sigmoid(x = var_887_1)[name = tensor<string, []>("op_889")];
+            tensor<fp32, [2, 256, 16]> inputs_35 = mul(x = var_887_0, y = var_889)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [2, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [2, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [2, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [2, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([2, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [2, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [2, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_46, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [2, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [2, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_884_begin_0 = const()[name = tensor<string, []>("op_884_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_884_end_0 = const()[name = tensor<string, []>("op_884_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
-            tensor<bool, [3]> var_884_end_mask_0 = const()[name = tensor<string, []>("op_884_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [2, 1, 256]> var_884 = slice_by_index(begin = var_884_begin_0, end = var_884_end_0, end_mask = var_884_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_884")];
-            tensor<int32, [3]> var_886_perm_0 = const()[name = tensor<string, []>("op_886_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 2, 256]> var_886 = transpose(perm = var_886_perm_0, x = var_884)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 2, 256]> input_151 = add(x = x_21, y = var_886)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 2, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 2, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 2, 256]> var_910 = mul(x = input_159, y = var_909)[name = tensor<string, []>("op_910")];
-            tensor<fp32, [1, 2, 256]> input_161 = add(x = var_910, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([2, 16, 256])];
+            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [2, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [2, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_920")];
+            tensor<int32, [3]> var_922_perm_0 = const()[name = tensor<string, []>("op_922_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 2, 256]> var_922 = transpose(perm = var_922_perm_0, x = var_920)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 2, 256]> input_153 = add(x = x_21, y = var_922)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_46, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 2, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 2, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 2, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_945 = const()[name = tensor<string, []>("op_945"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 2, 256]> var_946 = mul(x = input_161, y = var_945)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 2, 256]> input_163 = add(x = var_946, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 2, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_46, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 2]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 20]> cat = concat(axis = var_51, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
-            tensor<int32, [3]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
-            tensor<bool, [3]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = cat)[name = tensor<string, []>("op_928")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_930 = const()[name = tensor<string, []>("op_930"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 2, 1]> var_931 = reduce_l2_norm(axes = var_930, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [1, 256, 2]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [3]>([0, 0, 2])];
+            tensor<int32, [3]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [3]>([1, 256, 20])];
+            tensor<bool, [3]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = cat)[name = tensor<string, []>("op_964")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_966 = const()[name = tensor<string, []>("op_966"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 2, 1]> var_967 = reduce_l2_norm(axes = var_966, keep_dims = var_45, x = input_165)[name = tensor<string, []>("op_967")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_931)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_935_axis_0 = const()[name = tensor<string, []>("op_935_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_935_axis_0, values = (var_206, var_404, var_602, nkv_1))[name = tensor<string, []>("op_935")];
-            tensor<int32, []> var_937_axis_0 = const()[name = tensor<string, []>("op_937_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_937_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_937")];
-            tensor<int32, []> var_939_axis_0 = const()[name = tensor<string, []>("op_939_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_939_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_939")];
-            tensor<fp32, []> var_948 = const()[name = tensor<string, []>("op_948"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_953 = const()[name = tensor<string, []>("op_953"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_955 = const()[name = tensor<string, []>("op_955"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_956 = const()[name = tensor<string, []>("op_956"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_958 = const()[name = tensor<string, []>("op_958"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_968 = const()[name = tensor<string, []>("op_968"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 2, 1]> clip_0 = clip(alpha = var_59, beta = const_12, x = var_967)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 2, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_971_axis_0, values = (var_242, var_440, var_638, nkv_1))[name = tensor<string, []>("op_971")];
+            tensor<int32, []> var_973_axis_0 = const()[name = tensor<string, []>("op_973_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_973_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_973")];
+            tensor<int32, []> var_975_axis_0 = const()[name = tensor<string, []>("op_975_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_975_axis_0, values = (window_5, window_11, window_17, window))[name = tensor<string, []>("op_975")];
             tensor<fp32, [1, 2, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 2, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1030_axes_0 = const()[name = tensor<string, []>("op_1030_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 2, 1, 256]> var_1030 = expand_dims(axes = var_1030_axes_0, x = emb)[name = tensor<string, []>("op_1030")];
+            tensor<int32, [1]> var_1043_axes_0 = const()[name = tensor<string, []>("op_1043_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 2, 1, 256]> var_1043 = expand_dims(axes = var_1043_axes_0, x = emb)[name = tensor<string, []>("op_1043")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 2, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1030)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 2, 12, 512]> input_165 = concat(axis = var_962, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 2, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1038_perm_0 = const()[name = tensor<string, []>("op_1038_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1042 = const()[name = tensor<string, []>("op_1042"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1038 = transpose(perm = var_1038_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 2, 256]> x_29 = reshape(shape = var_1042, x = var_1038)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 2, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1043)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 2, 12, 512]> input_167 = concat(axis = var_52, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 2, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1051_perm_0 = const()[name = tensor<string, []>("op_1051_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1051 = transpose(perm = var_1051_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 2, 256]> x_29 = reshape(shape = var_1055, x = var_1051)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -903,132 +910,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 2, 256]> var_1050 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1051 = const()[name = tensor<string, []>("op_1051"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1052 = reshape(shape = var_1051, x = var_1050)[name = tensor<string, []>("op_1052")];
+            tensor<fp32, [12, 2, 256]> var_1063 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1064 = const()[name = tensor<string, []>("op_1064"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1065 = reshape(shape = var_1064, x = var_1063)[name = tensor<string, []>("op_1065")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1056 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1057 = const()[name = tensor<string, []>("op_1057"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 2, 256]> var_1058 = mul(x = var_1056, y = var_1057)[name = tensor<string, []>("op_1058")];
-            tensor<int32, [4]> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1060 = reshape(shape = var_1059, x = var_1058)[name = tensor<string, []>("op_1060")];
+            tensor<fp32, [12, 2, 256]> var_1069 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1070 = const()[name = tensor<string, []>("op_1070"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 2, 256]> var_1071 = mul(x = var_1069, y = var_1070)[name = tensor<string, []>("op_1071")];
+            tensor<int32, [4]> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1073 = reshape(shape = var_1072, x = var_1071)[name = tensor<string, []>("op_1073")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1064 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1066 = reshape(shape = var_1065, x = var_1064)[name = tensor<string, []>("op_1066")];
+            tensor<fp32, [12, 2, 256]> var_1077 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1079 = reshape(shape = var_1078, x = var_1077)[name = tensor<string, []>("op_1079")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 2, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_968, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [2]> cumsum_mask_1 = cumsum(axis = var_49, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [2]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_1 = clip(alpha = var_958, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [2]> clip_1 = clip(alpha = var_39, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [2]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1060)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1052)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 2, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1073)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 2, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1065)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 2, 2]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [2]>([1, 2])];
-            tensor<fp32, [1, 2]> var_1079 = reshape(shape = var_1078, x = valid_mask)[name = tensor<string, []>("op_1079")];
-            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1079)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1081 = const()[name = tensor<string, []>("op_1081"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1082 = reshape(shape = var_1081, x = sqrt_s_t_9)[name = tensor<string, []>("op_1082")];
-            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1082)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 2, 2]> var_1084 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1084")];
+            tensor<int32, [2]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [2]>([1, 2])];
+            tensor<fp32, [1, 2]> var_1092 = reshape(shape = var_1091, x = valid_mask)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [2, 2]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1092)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1094 = const()[name = tensor<string, []>("op_1094"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1095 = reshape(shape = var_1094, x = sqrt_s_t_9)[name = tensor<string, []>("op_1095")];
+            tensor<fp32, [2, 2]> M_9 = real_div(x = causal_with_valid_1, y = var_1095)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 2, 2]> var_1097 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1097")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1066)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1084, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1086_transpose_x_0 = const()[name = tensor<string, []>("op_1086_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1086_transpose_y_0 = const()[name = tensor<string, []>("op_1086_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> var_1086 = matmul(transpose_x = var_1086_transpose_x_0, transpose_y = var_1086_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1086")];
-            tensor<fp32, [2]> var_1087 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1087")];
-            tensor<int32, [4]> var_1088 = const()[name = tensor<string, []>("op_1088"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1089 = reshape(shape = var_1088, x = var_1087)[name = tensor<string, []>("op_1089")];
-            tensor<fp32, [12, 4, 2, 64]> cross_9 = mul(x = var_1086, y = var_1089)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 2, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1079)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 2, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1097, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1099_transpose_x_0 = const()[name = tensor<string, []>("op_1099_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1099_transpose_y_0 = const()[name = tensor<string, []>("op_1099_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> var_1099 = matmul(transpose_x = var_1099_transpose_x_0, transpose_y = var_1099_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1099")];
+            tensor<fp32, [2]> var_1100 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1100")];
+            tensor<int32, [4]> var_1101 = const()[name = tensor<string, []>("op_1101"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1102 = reshape(shape = var_1101, x = var_1100)[name = tensor<string, []>("op_1102")];
+            tensor<fp32, [12, 4, 2, 64]> cross_9 = mul(x = var_1099, y = var_1102)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 2, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1092 = const()[name = tensor<string, []>("op_1092"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1093 = reshape(shape = var_1092, x = valid_mask)[name = tensor<string, []>("op_1093")];
-            tensor<fp32, [12, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1093)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1095 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1095")];
-            tensor<bool, []> var_1097_transpose_x_1 = const()[name = tensor<string, []>("op_1097_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1097_transpose_y_1 = const()[name = tensor<string, []>("op_1097_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1097 = matmul(transpose_x = var_1097_transpose_x_1, transpose_y = var_1097_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1097")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1095, y = var_1097)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1099_keep_dims_0 = const()[name = tensor<string, []>("op_1099_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1099 = reduce_sum(keep_dims = var_1099_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1099")];
-            tensor<int32, [1]> var_1100 = const()[name = tensor<string, []>("op_1100"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1101 = reshape(shape = var_1100, x = var_1099)[name = tensor<string, []>("op_1101")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1101)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1105 = const()[name = tensor<string, []>("op_1105"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1106 = reshape(shape = var_1105, x = valid_mask)[name = tensor<string, []>("op_1106")];
+            tensor<fp32, [12, 4, 2, 64]> v_masked_1 = mul(x = v_9, y = var_1106)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1108 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1108")];
+            tensor<bool, []> var_1110_transpose_x_1 = const()[name = tensor<string, []>("op_1110_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1110_transpose_y_1 = const()[name = tensor<string, []>("op_1110_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1110 = matmul(transpose_x = var_1110_transpose_x_1, transpose_y = var_1110_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1110")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1108, y = var_1110)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1112_keep_dims_0 = const()[name = tensor<string, []>("op_1112_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1112 = reduce_sum(keep_dims = var_1112_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1112")];
+            tensor<int32, [1]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1114 = reshape(shape = var_1113, x = var_1112)[name = tensor<string, []>("op_1114")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1114)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_958, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_39, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1105 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1105")];
-            tensor<int32, [4]> var_1106_perm_0 = const()[name = tensor<string, []>("op_1106_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1118 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1118")];
+            tensor<int32, [4]> var_1119_perm_0 = const()[name = tensor<string, []>("op_1119_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 4, 64]> var_1106 = transpose(perm = var_1106_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_955, x = var_1106)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> out_29 = reshape(shape = var_1110, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 2, 256]> var_1112 = silu(x = input_169)[name = tensor<string, []>("op_1112")];
-            tensor<fp32, [12, 2, 256]> input_171 = mul(x = var_1112, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 2, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 2, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 2, 4, 64]> var_1119 = transpose(perm = var_1119_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 2, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_54, x = var_1119)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> out_29 = reshape(shape = var_1123, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 2, 256]> var_1125 = silu(x = input_171)[name = tensor<string, []>("op_1125")];
+            tensor<fp32, [12, 2, 256]> input_173 = mul(x = var_1125, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 2, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 2, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_953, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1122 = const()[name = tensor<string, []>("op_1122"), val = tensor<int32, [4]>([1, 12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1123 = reshape(shape = var_1122, x = xt_1)[name = tensor<string, []>("op_1123")];
-            tensor<int32, [4]> var_1124_perm_0 = const()[name = tensor<string, []>("op_1124_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1127 = const()[name = tensor<string, []>("op_1127"), val = tensor<int32, [3]>([2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> var_1124 = transpose(perm = var_1124_perm_0, x = var_1123)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [2, 12, 256]> query_1 = reshape(shape = var_1127, x = var_1124)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 2, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_46, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1135 = const()[name = tensor<string, []>("op_1135"), val = tensor<int32, [4]>([1, 12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1136 = reshape(shape = var_1135, x = xt_1)[name = tensor<string, []>("op_1136")];
+            tensor<int32, [4]> var_1137_perm_0 = const()[name = tensor<string, []>("op_1137_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1140 = const()[name = tensor<string, []>("op_1140"), val = tensor<int32, [3]>([2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> var_1137 = transpose(perm = var_1137_perm_0, x = var_1136)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [2, 12, 256]> query_1 = reshape(shape = var_1140, x = var_1137)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 2, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 2, 768]> var_1150 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 2, 768]> var_1163 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 2, 3, 256])];
-            tensor<fp32, [12, 2, 3, 256]> var_1152 = reshape(shape = concat_1, x = var_1150)[name = tensor<string, []>("op_1152")];
-            tensor<int32, [1]> var_1153_axes_0 = const()[name = tensor<string, []>("op_1153_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 2, 3, 256]> var_1153 = expand_dims(axes = var_1153_axes_0, x = var_1152)[name = tensor<string, []>("op_1153")];
-            tensor<int32, [5]> var_1154_perm_0 = const()[name = tensor<string, []>("op_1154_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1155_axes_0 = const()[name = tensor<string, []>("op_1155_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 2, 1, 256]> var_1154 = transpose(perm = var_1154_perm_0, x = var_1153)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 2, 256]> var_1155 = squeeze(axes = var_1155_axes_0, x = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<fp32, [12, 2, 3, 256]> var_1165 = reshape(shape = concat_1, x = var_1163)[name = tensor<string, []>("op_1165")];
+            tensor<int32, [1]> var_1166_axes_0 = const()[name = tensor<string, []>("op_1166_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 2, 3, 256]> var_1166 = expand_dims(axes = var_1166_axes_0, x = var_1165)[name = tensor<string, []>("op_1166")];
+            tensor<int32, [5]> var_1167_perm_0 = const()[name = tensor<string, []>("op_1167_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = tensor<string, []>("op_1168_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 2, 1, 256]> var_1167 = transpose(perm = var_1167_perm_0, x = var_1166)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 2, 256]> var_1168 = squeeze(axes = var_1168_axes_0, x = var_1167)[name = tensor<string, []>("op_1168")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 2, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 2, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 2, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 2, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 2, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1155)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1163 = const()[name = tensor<string, []>("op_1163"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1164 = reshape(shape = var_1163, x = q_11)[name = tensor<string, []>("op_1164")];
+            tensor<fp32, [12, 2, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1168)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1176 = const()[name = tensor<string, []>("op_1176"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1177 = reshape(shape = var_1176, x = q_11)[name = tensor<string, []>("op_1177")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1170 = const()[name = tensor<string, []>("op_1170"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1171 = reshape(shape = var_1170, x = k_11)[name = tensor<string, []>("op_1171")];
+            tensor<int32, [3]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1184 = reshape(shape = var_1183, x = k_11)[name = tensor<string, []>("op_1184")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1178 = reshape(shape = var_1177, x = v_11)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [3]> var_1190 = const()[name = tensor<string, []>("op_1190"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1191 = reshape(shape = var_1190, x = v_11)[name = tensor<string, []>("op_1191")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1181 = const()[name = tensor<string, []>("op_1181"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1164)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [2, 4, 12, 64]> q_15 = reshape(shape = var_1181, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1183 = const()[name = tensor<string, []>("op_1183"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1171)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [2, 4, 12, 64]> k_15 = reshape(shape = var_1183, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1178)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [2, 4, 12, 64]> v_15 = reshape(shape = var_1185, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1194 = const()[name = tensor<string, []>("op_1194"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1177)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [2, 4, 12, 64]> q_15 = reshape(shape = var_1194, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1196 = const()[name = tensor<string, []>("op_1196"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1184)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [2, 4, 12, 64]> k_15 = reshape(shape = var_1196, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1198 = const()[name = tensor<string, []>("op_1198"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1191)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [2, 4, 12, 64]> v_15 = reshape(shape = var_1198, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1039,30 +1046,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1188 = const()[name = tensor<string, []>("op_1188"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1193 = const()[name = tensor<string, []>("op_1193"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [12, 2, 4, 64]> var_1189 = transpose(perm = var_1188, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1193, x = var_1189)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> attn_output_7 = reshape(shape = var_1197, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1201 = const()[name = tensor<string, []>("op_1201"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1206 = const()[name = tensor<string, []>("op_1206"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [12, 2, 4, 64]> var_1202 = transpose(perm = var_1201, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [24, 256]> attn_output_3 = reshape(shape = var_1206, x = var_1202)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [24, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1210 = const()[name = tensor<string, []>("op_1210"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> attn_output_7 = reshape(shape = var_1210, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [2, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_953, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [2, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [2, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [2, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [2, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_46, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [2, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [2, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [2, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [2, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_953, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([1, 2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> x_31 = reshape(shape = var_1217, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1219_perm_0 = const()[name = tensor<string, []>("op_1219_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1223 = const()[name = tensor<string, []>("op_1223"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1219 = transpose(perm = var_1219_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 2, 256]> x = reshape(shape = var_1223, x = var_1219)[name = tensor<string, []>("x")];
+            tensor<fp32, [2, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_46, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1230 = const()[name = tensor<string, []>("op_1230"), val = tensor<int32, [4]>([1, 2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> x_31 = reshape(shape = var_1230, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1232_perm_0 = const()[name = tensor<string, []>("op_1232_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1232 = transpose(perm = var_1232_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 2, 256]> x = reshape(shape = var_1236, x = var_1232)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1073,120 +1080,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 2, 256]> var_1231 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1233 = reshape(shape = var_1232, x = var_1231)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [12, 2, 256]> var_1244 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1246 = reshape(shape = var_1245, x = var_1244)[name = tensor<string, []>("op_1246")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1237 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 2, 256]> var_1239 = mul(x = var_1237, y = var_1238)[name = tensor<string, []>("op_1239")];
-            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [12, 2, 256]> var_1250 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1251 = const()[name = tensor<string, []>("op_1251"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 2, 256]> var_1252 = mul(x = var_1250, y = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<int32, [4]> var_1253 = const()[name = tensor<string, []>("op_1253"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1254 = reshape(shape = var_1253, x = var_1252)[name = tensor<string, []>("op_1254")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> var_1245 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1246 = const()[name = tensor<string, []>("op_1246"), val = tensor<int32, [4]>([12, 2, 4, 64])];
-            tensor<fp32, [12, 2, 4, 64]> var_1247 = reshape(shape = var_1246, x = var_1245)[name = tensor<string, []>("op_1247")];
+            tensor<fp32, [12, 2, 256]> var_1258 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1259 = const()[name = tensor<string, []>("op_1259"), val = tensor<int32, [4]>([12, 2, 4, 64])];
+            tensor<fp32, [12, 2, 4, 64]> var_1260 = reshape(shape = var_1259, x = var_1258)[name = tensor<string, []>("op_1260")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 2, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 2, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [2]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [2]> clip_3 = clip(alpha = var_958, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [2]> clip_3 = clip(alpha = var_39, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [2]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1241)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1233)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 2, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1254)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 2, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1246)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 2, 2]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [2]>([2, 1])];
-            tensor<fp32, [2, 1]> var_1263 = reshape(shape = var_1262, x = sqrt_s_t)[name = tensor<string, []>("op_1263")];
-            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1263)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 2, 2]> var_1265 = mul(x = qk, y = M)[name = tensor<string, []>("op_1265")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1247)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 2, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1265, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1267_transpose_x_0 = const()[name = tensor<string, []>("op_1267_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1267_transpose_y_0 = const()[name = tensor<string, []>("op_1267_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 2, 64]> var_1267 = matmul(transpose_x = var_1267_transpose_x_0, transpose_y = var_1267_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1267")];
-            tensor<fp32, [2]> var_1268 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1268")];
-            tensor<int32, [4]> var_1269 = const()[name = tensor<string, []>("op_1269"), val = tensor<int32, [4]>([1, 1, 2, 1])];
-            tensor<fp32, [1, 1, 2, 1]> var_1270 = reshape(shape = var_1269, x = var_1268)[name = tensor<string, []>("op_1270")];
-            tensor<fp32, [12, 4, 2, 64]> cross = mul(x = var_1267, y = var_1270)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 2, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1093)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1276 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1276")];
-            tensor<bool, []> var_1278_transpose_x_1 = const()[name = tensor<string, []>("op_1278_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1278_transpose_y_1 = const()[name = tensor<string, []>("op_1278_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1278 = matmul(transpose_x = var_1278_transpose_x_1, transpose_y = var_1278_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1278")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1276, y = var_1278)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1101)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1275 = const()[name = tensor<string, []>("op_1275"), val = tensor<int32, [2]>([2, 1])];
+            tensor<fp32, [2, 1]> var_1276 = reshape(shape = var_1275, x = sqrt_s_t)[name = tensor<string, []>("op_1276")];
+            tensor<fp32, [2, 2]> M = real_div(x = causal_with_valid_1, y = var_1276)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 2, 2]> var_1278 = mul(x = qk, y = M)[name = tensor<string, []>("op_1278")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1260)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 2, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1278, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1280_transpose_x_0 = const()[name = tensor<string, []>("op_1280_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1280_transpose_y_0 = const()[name = tensor<string, []>("op_1280_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 2, 64]> var_1280 = matmul(transpose_x = var_1280_transpose_x_0, transpose_y = var_1280_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1280")];
+            tensor<fp32, [2]> var_1281 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1281")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([1, 1, 2, 1])];
+            tensor<fp32, [1, 1, 2, 1]> var_1283 = reshape(shape = var_1282, x = var_1281)[name = tensor<string, []>("op_1283")];
+            tensor<fp32, [12, 4, 2, 64]> cross = mul(x = var_1280, y = var_1283)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 2, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 2, 64]> v_masked = mul(x = v_17, y = var_1106)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1289 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1289")];
+            tensor<bool, []> var_1291_transpose_x_1 = const()[name = tensor<string, []>("op_1291_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1291_transpose_y_1 = const()[name = tensor<string, []>("op_1291_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1291 = matmul(transpose_x = var_1291_transpose_x_1, transpose_y = var_1291_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1291")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1289, y = var_1291)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1114)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_958, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_39, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1287_perm_0 = const()[name = tensor<string, []>("op_1287_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1300_perm_0 = const()[name = tensor<string, []>("op_1300_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 4, 64]> var_1287 = transpose(perm = var_1287_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_955, x = var_1287)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1291 = const()[name = tensor<string, []>("op_1291"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> out = reshape(shape = var_1291, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 2, 256]> var_1293 = silu(x = input_187)[name = tensor<string, []>("op_1293")];
-            tensor<fp32, [12, 2, 256]> input_189 = mul(x = var_1293, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 2, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 2, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 2, 4, 64]> var_1300 = transpose(perm = var_1300_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 2, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_54, x = var_1300)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> out = reshape(shape = var_1304, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 2, 256]> var_1306 = silu(x = input_189)[name = tensor<string, []>("op_1306")];
+            tensor<fp32, [12, 2, 256]> input_191 = mul(x = var_1306, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 2, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 2, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_953, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [4]>([1, 12, 2, 256])];
-            tensor<fp32, [1, 12, 2, 256]> var_1304 = reshape(shape = var_1303, x = xt_5)[name = tensor<string, []>("op_1304")];
-            tensor<int32, [4]> var_1305_perm_0 = const()[name = tensor<string, []>("op_1305_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1308 = const()[name = tensor<string, []>("op_1308"), val = tensor<int32, [3]>([2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> var_1305 = transpose(perm = var_1305_perm_0, x = var_1304)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [2, 12, 256]> query_5 = reshape(shape = var_1308, x = var_1305)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 2, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_46, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1316 = const()[name = tensor<string, []>("op_1316"), val = tensor<int32, [4]>([1, 12, 2, 256])];
+            tensor<fp32, [1, 12, 2, 256]> var_1317 = reshape(shape = var_1316, x = xt_5)[name = tensor<string, []>("op_1317")];
+            tensor<int32, [4]> var_1318_perm_0 = const()[name = tensor<string, []>("op_1318_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [3]>([2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> var_1318 = transpose(perm = var_1318_perm_0, x = var_1317)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [2, 12, 256]> query_5 = reshape(shape = var_1321, x = var_1318)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 2, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 2, 768]> var_1331 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 2, 768]> var_1344 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 2, 3, 256])];
-            tensor<fp32, [12, 2, 3, 256]> var_1333 = reshape(shape = concat_2, x = var_1331)[name = tensor<string, []>("op_1333")];
-            tensor<int32, [1]> var_1334_axes_0 = const()[name = tensor<string, []>("op_1334_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 2, 3, 256]> var_1334 = expand_dims(axes = var_1334_axes_0, x = var_1333)[name = tensor<string, []>("op_1334")];
-            tensor<int32, [5]> var_1335_perm_0 = const()[name = tensor<string, []>("op_1335_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1336_axes_0 = const()[name = tensor<string, []>("op_1336_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 2, 1, 256]> var_1335 = transpose(perm = var_1335_perm_0, x = var_1334)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 2, 256]> var_1336 = squeeze(axes = var_1336_axes_0, x = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<fp32, [12, 2, 3, 256]> var_1346 = reshape(shape = concat_2, x = var_1344)[name = tensor<string, []>("op_1346")];
+            tensor<int32, [1]> var_1347_axes_0 = const()[name = tensor<string, []>("op_1347_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 2, 3, 256]> var_1347 = expand_dims(axes = var_1347_axes_0, x = var_1346)[name = tensor<string, []>("op_1347")];
+            tensor<int32, [5]> var_1348_perm_0 = const()[name = tensor<string, []>("op_1348_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1349_axes_0 = const()[name = tensor<string, []>("op_1349_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 2, 1, 256]> var_1348 = transpose(perm = var_1348_perm_0, x = var_1347)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 2, 256]> var_1349 = squeeze(axes = var_1349_axes_0, x = var_1348)[name = tensor<string, []>("op_1349")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 2, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 2, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 2, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 2, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 2, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1336)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1344 = const()[name = tensor<string, []>("op_1344"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1345 = reshape(shape = var_1344, x = q_19)[name = tensor<string, []>("op_1345")];
+            tensor<fp32, [12, 2, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1349)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1358 = reshape(shape = var_1357, x = q_19)[name = tensor<string, []>("op_1358")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1351 = const()[name = tensor<string, []>("op_1351"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1352 = reshape(shape = var_1351, x = k_19)[name = tensor<string, []>("op_1352")];
+            tensor<int32, [3]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1365 = reshape(shape = var_1364, x = k_19)[name = tensor<string, []>("op_1365")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [3]>([12, 8, 64])];
-            tensor<fp32, [12, 8, 64]> var_1359 = reshape(shape = var_1358, x = v_19)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [3]> var_1371 = const()[name = tensor<string, []>("op_1371"), val = tensor<int32, [3]>([12, 8, 64])];
+            tensor<fp32, [12, 8, 64]> var_1372 = reshape(shape = var_1371, x = v_19)[name = tensor<string, []>("op_1372")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1362 = const()[name = tensor<string, []>("op_1362"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1345)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [2, 4, 12, 64]> q = reshape(shape = var_1362, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1364 = const()[name = tensor<string, []>("op_1364"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1352)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [2, 4, 12, 64]> k = reshape(shape = var_1364, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([2, 4, 12, 64])];
-            tensor<fp32, [8, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1359)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [2, 4, 12, 64]> v = reshape(shape = var_1366, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1375 = const()[name = tensor<string, []>("op_1375"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1358)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [2, 4, 12, 64]> q = reshape(shape = var_1375, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1377 = const()[name = tensor<string, []>("op_1377"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1365)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [2, 4, 12, 64]> k = reshape(shape = var_1377, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1379 = const()[name = tensor<string, []>("op_1379"), val = tensor<int32, [4]>([2, 4, 12, 64])];
+            tensor<fp32, [8, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1372)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [2, 4, 12, 64]> v = reshape(shape = var_1379, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [2, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1197,36 +1204,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [2, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1369 = const()[name = tensor<string, []>("op_1369"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1374 = const()[name = tensor<string, []>("op_1374"), val = tensor<int32, [2]>([24, 256])];
-            tensor<fp32, [12, 2, 4, 64]> var_1370 = transpose(perm = var_1369, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1374, x = var_1370)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<int32, [3]>([12, 2, 256])];
-            tensor<fp32, [12, 2, 256]> attn_output = reshape(shape = var_1378, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1382 = const()[name = tensor<string, []>("op_1382"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1387 = const()[name = tensor<string, []>("op_1387"), val = tensor<int32, [2]>([24, 256])];
+            tensor<fp32, [12, 2, 4, 64]> var_1383 = transpose(perm = var_1382, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [24, 256]> attn_output_11 = reshape(shape = var_1387, x = var_1383)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [24, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1391 = const()[name = tensor<string, []>("op_1391"), val = tensor<int32, [3]>([12, 2, 256])];
+            tensor<fp32, [12, 2, 256]> attn_output = reshape(shape = var_1391, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [2, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [2, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_953, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [2, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [2, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [2, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [2, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [2, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_46, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [2, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [2, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [2, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [2, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [2, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_953, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([1, 2, 12, 256])];
-            tensor<fp32, [1, 2, 12, 256]> input = reshape(shape = var_1398, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 2, 12, 1]> var_1401 = reduce_l2_norm(axes = var_1400, keep_dims = var_956, x = input)[name = tensor<string, []>("op_1401")];
+            tensor<fp32, [2, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_46, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1411 = const()[name = tensor<string, []>("op_1411"), val = tensor<int32, [4]>([1, 2, 12, 256])];
+            tensor<fp32, [1, 2, 12, 256]> input = reshape(shape = var_1411, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 2, 12, 1]> var_1414 = reduce_l2_norm(axes = var_1413, keep_dims = var_45, x = input)[name = tensor<string, []>("op_1414")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 2, 12, 1]> clip_5 = clip(alpha = var_948, beta = const_42, x = var_1401)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 2, 12, 256]> var_1403 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [1, 2, 12, 1]> clip_5 = clip(alpha = var_59, beta = const_42, x = var_1414)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 2, 12, 256]> var_1416 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1416")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([2, 1, 256])];
             tensor<fp32, [2, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([2, 256, 12])];
-            tensor<fp32, [1, 2, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1403)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 2, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1416)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [2, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1237,10 +1244,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 2, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 2, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 2, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1407")];
-            tensor<int32, []> var_1409_axis_0 = const()[name = tensor<string, []>("op_1409_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1409_axis_0, values = (var_1105, nkv))[name = tensor<string, []>("op_1409")];
-            tensor<int32, []> var_1411_axis_0 = const()[name = tensor<string, []>("op_1411_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1411_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1411")];
+            tensor<fp32, [1, 2, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1420")];
+            tensor<int32, []> var_1422_axis_0 = const()[name = tensor<string, []>("op_1422_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1422_axis_0, values = (var_1118, nkv))[name = tensor<string, []>("op_1422")];
+            tensor<int32, []> var_1424_axis_0 = const()[name = tensor<string, []>("op_1424_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1424_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1424")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index c03c36525cece803ff8208a5ea26cac5ac40a745..65c40d2e1c058ddda3921d57f1cd4e276a5a9a92 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4835a14c03b80d2ffb221a44b04b7b0b1c4cdb4e6a71682753bcfe838c3773b1
-size 179876
+oid sha256:47aad9d08e699005add53327400f5fd34df38bbf24c5cf7510373733c1983619
+size 184856
diff --git a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Manifest.json b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Manifest.json
index 0fd137e70ac537ef68adfd0386c412681e663286..c6b16cff84064a7ae23053b1d8da69e7034ab07d 100644
--- a/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Manifest.json
+++ b/optimized/dih3/200ms/ls_eend_dih3_200ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "5BAC2999-7E80-40F0-8220-7BBE6801AB8D": {
+        "39AD8B7F-BA07-4BF0-9035-9C5650454BFF": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         },
-        "ED375499-50AC-4B4A-A792-3AB2C9B91F44": {
+        "5B2A221F-BCEE-4C0E-8D64-CFA3E747939D": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "ED375499-50AC-4B4A-A792-3AB2C9B91F44"
+    "rootModelIdentifier": "5B2A221F-BCEE-4C0E-8D64-CFA3E747939D"
 }
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/analytics/coremldata.bin b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/analytics/coremldata.bin
index 65f7eaf6f1cff00f16e9edf9cb4cdfde14a47a56..c2bd905a2fc18436badf62025ecdac9f5ba4282a 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f1ec2df8f646b9374cef6046fd9256230748f14059ab5f8e8258279b0565bda
+oid sha256:09fea4de1562533e09d1e835506a267b44e274c7ab11fa1f8faf37c1580952c9
 size 243
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/coremldata.bin b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/coremldata.bin
index 10dce50396088c6728d963dc8842a625a7b9facc..c6561f077fce2c8bade14393c9ef3927d4594c84 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/coremldata.bin
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b37ebc9cf057e7be9a35dd0948213af49004827a34416b9b3634e4609254a179
-size 1310
+oid sha256:d88e937e4d0ab1139e715f6bc98ec09ce1af57dfc6bcb6639bdbf18359f72e1d
+size 1413
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/metadata.json b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/metadata.json
index b80d71250f01b02f0cb7ffcce159276184b82521..2783dc934f83ee0345b1f04642de92ed9d264557 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/metadata.json
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=3, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=3, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 56,
+      "Ios17.sliceByIndex" : 59,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 18,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 3 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 35 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 3, 345]",
+        "shape" : "[1, 35, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 3, \"step_duration_ms\": 300, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 35}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/model.mil b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/model.mil
index 48f1c415ceeabfb14a7edfb748becb4366d6de9f..d54a8f41d62f449a9ccc5f0085a42da328ef56f7 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/model.mil
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlmodelc/model.mil
@@ -1,234 +1,252 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 3, 345]> features, tensor<fp32, [3]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [3, 3]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [3]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
-            tensor<fp32, [3, 3]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 3, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 35, 23]> features, tensor<fp32, [3]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [3, 3]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [3]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [3]>([0x1p+0, 0x1p+1, 0x1.8p+1])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983040)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336384)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337472)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338560)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339648)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340736)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5344896)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393536)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394624)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443264)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444352)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445440)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446528)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708736)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709824)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972032)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973120)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235328)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236416)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498624)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499712)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8761920)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763008)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764096)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766208)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290560)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307008)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308096)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309184)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310272)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311360)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312448)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574656)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575744)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576832)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9580992)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629632)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630720)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679360)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680448)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681536)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682624)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683712)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11687872)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736512)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737600)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786240)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787328)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788416)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789504)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051712)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052800)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315008)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316096)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578304)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579392)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841600)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15104896)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105984)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107072)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109184)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633536)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15649984)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651072)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652160)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653248)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654336)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655424)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917632)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918720)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919808)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15923968)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972608)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973696)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022336)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023424)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024512)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025600)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026688)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030848)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079488)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080576)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129216)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130304)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131392)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132480)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394688)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395776)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20657984)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659072)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921280)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922368)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184576)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21447872)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448960)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450048)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452160)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976512)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21992960)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997312)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998400)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260608)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261696)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262784)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22266944)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315584)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316672)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365312)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366400)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367488)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368576)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369664)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373824)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422464)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423552)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472192)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473280)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474368)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475456)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737664)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738752)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27000960)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002048)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264256)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265344)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527552)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790848)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791936)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793024)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795136)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319488)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28335936)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340288)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341376)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603584)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604672)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605760)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28609920)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658560)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659648)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708288)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709376)))];
+            tensor<fp32, [3, 3]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [3, 3]>([[0x1p+0, 0x0p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x0p+0], [0x1p+0, 0x1p+0, 0x1p+0]])];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710464)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711552)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31235904)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236992)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499200)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500288)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762496)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763584)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32025792)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026880)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289088)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290176)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552384)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553472)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554560)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555648)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32817856)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32820992)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607488)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608576)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609664)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33617920)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715136)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716224)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813440)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814528)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815616)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816704)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38078912)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080000)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342208)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343296)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605504)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606592)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38868800)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869888)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132096)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133184)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134272)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135360)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397568)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400704)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187200)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188288)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189376)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197632)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42294848)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295936)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393152)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394240)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_46 = const()[name = tensor<string, []>("op_46"), val = tensor<int32, [3]>([1, 3, 345])];
+            tensor<fp32, [1, 3, 345]> input_1 = reshape(shape = var_46, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_61 = const()[name = tensor<string, []>("op_61"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_64 = const()[name = tensor<string, []>("op_64"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 3, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 3, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 3, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 3, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 3, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_56, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 3, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 3, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 3, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_193 = const()[name = tensor<string, []>("op_193"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_194 = mul(x = input_13, y = var_193)[name = tensor<string, []>("op_194")];
+            tensor<fp32, [1, 3, 256]> input_15 = add(x = var_194, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 3, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,163 +257,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 3, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 3, 256]> var_208 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_209 = const()[name = tensor<string, []>("op_209"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_210 = reshape(shape = var_209, x = var_208)[name = tensor<string, []>("op_210")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 3, 256]> var_214 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_215 = const()[name = tensor<string, []>("op_215"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_216 = mul(x = var_214, y = var_215)[name = tensor<string, []>("op_216")];
+            tensor<int32, [4]> var_217 = const()[name = tensor<string, []>("op_217"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_218 = reshape(shape = var_217, x = var_216)[name = tensor<string, []>("op_218")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 3, 256]> var_222 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_223 = const()[name = tensor<string, []>("op_223"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_224 = reshape(shape = var_223, x = var_222)[name = tensor<string, []>("op_224")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 3, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [3]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [3]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 3, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_218)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 3, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_210)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 3, 3]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [3, 3]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 3, 3]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_234 = const()[name = tensor<string, []>("op_234"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_235 = reshape(shape = var_234, x = sqrt_s_t_1)[name = tensor<string, []>("op_235")];
+            tensor<fp32, [3, 3]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_235)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 3, 3]> var_237 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_237")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [3]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 3, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_224)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 3, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_237, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_239_transpose_x_0 = const()[name = tensor<string, []>("op_239_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_239_transpose_y_0 = const()[name = tensor<string, []>("op_239_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_239 = matmul(transpose_x = var_239_transpose_x_0, transpose_y = var_239_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_239")];
+            tensor<fp32, [3]> var_240 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_240")];
+            tensor<int32, [4]> var_241 = const()[name = tensor<string, []>("op_241"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_242 = reshape(shape = var_241, x = var_240)[name = tensor<string, []>("op_242")];
+            tensor<fp32, [1, 4, 3, 64]> cross_1 = mul(x = var_239, y = var_242)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 3, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_245 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_245")];
+            tensor<bool, []> var_247_transpose_x_1 = const()[name = tensor<string, []>("op_247_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_247_transpose_y_1 = const()[name = tensor<string, []>("op_247_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_247 = matmul(transpose_x = var_247_transpose_x_1, transpose_y = var_247_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_247")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_245, y = var_247)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_249 = const()[name = tensor<string, []>("op_249"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_249)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_251 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_251")];
+            tensor<fp32, [1, 4, 64, 64]> var_252 = real_div(x = new_kv_unnorm_1, y = var_251)[name = tensor<string, []>("op_252")];
+            tensor<int32, [4]> var_253_perm_0 = const()[name = tensor<string, []>("op_253_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 3, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 3, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 3, 4, 64]> var_253 = transpose(perm = var_253_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 3, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_64, x = var_253)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_257 = const()[name = tensor<string, []>("op_257"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_5 = reshape(shape = var_257, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 3, 256]> var_259 = silu(x = input_19)[name = tensor<string, []>("op_259")];
+            tensor<fp32, [1, 3, 256]> input_21 = mul(x = var_259, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 3, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 3, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_267 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = x_3)[name = tensor<string, []>("op_267")];
+            tensor<int32, [3]> var_270_begin_0 = const()[name = tensor<string, []>("op_270_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_270_end_0 = const()[name = tensor<string, []>("op_270_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_270_end_mask_0 = const()[name = tensor<string, []>("op_270_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_270 = slice_by_index(begin = var_270_begin_0, end = var_270_end_0, end_mask = var_270_end_mask_0, x = window_1)[name = tensor<string, []>("op_270")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_72, interleave = window_3_interleave_0, values = (var_270, var_267))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_275 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = x_3)[name = tensor<string, []>("op_275")];
+            tensor<int32, [3]> var_278_begin_0 = const()[name = tensor<string, []>("op_278_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_278_end_0 = const()[name = tensor<string, []>("op_278_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_278_end_mask_0 = const()[name = tensor<string, []>("op_278_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_278 = slice_by_index(begin = var_278_begin_0, end = var_278_end_0, end_mask = var_278_end_mask_0, x = window_3)[name = tensor<string, []>("op_278")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_72, interleave = window_5_interleave_0, values = (var_278, var_275))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_283 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = x_3)[name = tensor<string, []>("op_283")];
+            tensor<int32, [3]> var_286_begin_0 = const()[name = tensor<string, []>("op_286_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_286_end_0 = const()[name = tensor<string, []>("op_286_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_286_end_mask_0 = const()[name = tensor<string, []>("op_286_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_286 = slice_by_index(begin = var_286_begin_0, end = var_286_end_0, end_mask = var_286_end_mask_0, x = window_5)[name = tensor<string, []>("op_286")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_72, interleave = window_7_interleave_0, values = (var_286, var_283))[name = tensor<string, []>("window_7")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_23 = concat(axis = var_59, interleave = input_23_interleave_0, values = (window_3, window_5, window_7))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_265_split_sizes_0 = const()[name = tensor<string, []>("op_265_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_265_axis_0 = const()[name = tensor<string, []>("op_265_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_265_0, tensor<fp32, [3, 256, 16]> var_265_1 = split(axis = var_265_axis_0, split_sizes = var_265_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_265")];
-            tensor<fp32, [3, 256, 16]> var_267 = sigmoid(x = var_265_1)[name = tensor<string, []>("op_267")];
-            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_265_0, y = var_267)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [3, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [3, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_311_split_sizes_0 = const()[name = tensor<string, []>("op_311_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_311_axis_0 = const()[name = tensor<string, []>("op_311_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_311_0, tensor<fp32, [3, 256, 16]> var_311_1 = split(axis = var_311_axis_0, split_sizes = var_311_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_311")];
+            tensor<fp32, [3, 256, 16]> var_313 = sigmoid(x = var_311_1)[name = tensor<string, []>("op_313")];
+            tensor<fp32, [3, 256, 16]> inputs_5 = mul(x = var_311_0, y = var_313)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [3, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [3, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [3, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [3, 1, 256]> var_298 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_298")];
-            tensor<int32, [3]> var_300_perm_0 = const()[name = tensor<string, []>("op_300_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_300 = transpose(perm = var_300_perm_0, x = var_298)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 3, 256]> input_31 = add(x = x_3, y = var_300)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 3, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 3, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_323 = const()[name = tensor<string, []>("op_323"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_324 = mul(x = input_39, y = var_323)[name = tensor<string, []>("op_324")];
-            tensor<fp32, [1, 3, 256]> input_41 = add(x = var_324, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_344_begin_0 = const()[name = tensor<string, []>("op_344_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_344_end_0 = const()[name = tensor<string, []>("op_344_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_344_end_mask_0 = const()[name = tensor<string, []>("op_344_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [3, 1, 256]> var_344 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_344")];
+            tensor<int32, [3]> var_346_perm_0 = const()[name = tensor<string, []>("op_346_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_346 = transpose(perm = var_346_perm_0, x = var_344)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 3, 256]> input_33 = add(x = x_3, y = var_346)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 3, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 3, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 3, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_370 = mul(x = input_41, y = var_369)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> input_43 = add(x = var_370, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 3, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 3, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_353 = const()[name = tensor<string, []>("op_353"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_354 = mul(x = input_51, y = var_353)[name = tensor<string, []>("op_354")];
-            tensor<fp32, [1, 3, 256]> input_53 = add(x = var_354, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 3, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 3, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 3, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 3, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_399 = const()[name = tensor<string, []>("op_399"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_400 = mul(x = input_53, y = var_399)[name = tensor<string, []>("op_400")];
+            tensor<fp32, [1, 3, 256]> input_55 = add(x = var_400, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 3, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -406,163 +424,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 3, 256]> var_368 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_369 = const()[name = tensor<string, []>("op_369"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_370 = reshape(shape = var_369, x = var_368)[name = tensor<string, []>("op_370")];
+            tensor<fp32, [1, 3, 256]> var_414 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_415 = const()[name = tensor<string, []>("op_415"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_416 = reshape(shape = var_415, x = var_414)[name = tensor<string, []>("op_416")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_374 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_375 = const()[name = tensor<string, []>("op_375"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_376 = mul(x = var_374, y = var_375)[name = tensor<string, []>("op_376")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 3, 256]> var_420 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_421 = const()[name = tensor<string, []>("op_421"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_422 = mul(x = var_420, y = var_421)[name = tensor<string, []>("op_422")];
+            tensor<int32, [4]> var_423 = const()[name = tensor<string, []>("op_423"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_424 = reshape(shape = var_423, x = var_422)[name = tensor<string, []>("op_424")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_382 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_384 = reshape(shape = var_383, x = var_382)[name = tensor<string, []>("op_384")];
+            tensor<fp32, [1, 3, 256]> var_428 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_429 = const()[name = tensor<string, []>("op_429"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_430 = reshape(shape = var_429, x = var_428)[name = tensor<string, []>("op_430")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 3, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [3]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [3]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_370)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 3, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_424)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 3, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_416)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 3, 3]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_395 = reshape(shape = var_394, x = sqrt_s_t_3)[name = tensor<string, []>("op_395")];
-            tensor<fp32, [3, 3]> M_3 = real_div(x = encoder__causal_mask, y = var_395)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 3, 3]> var_397 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_397")];
+            tensor<int32, [2]> var_440 = const()[name = tensor<string, []>("op_440"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_441 = reshape(shape = var_440, x = sqrt_s_t_3)[name = tensor<string, []>("op_441")];
+            tensor<fp32, [3, 3]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_441)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 3, 3]> var_443 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_443")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_384)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_397, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_399_transpose_x_0 = const()[name = tensor<string, []>("op_399_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_399_transpose_y_0 = const()[name = tensor<string, []>("op_399_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_399 = matmul(transpose_x = var_399_transpose_x_0, transpose_y = var_399_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_399")];
-            tensor<fp32, [3]> var_400 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_400")];
-            tensor<int32, [4]> var_401 = const()[name = tensor<string, []>("op_401"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_402 = reshape(shape = var_401, x = var_400)[name = tensor<string, []>("op_402")];
-            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_399, y = var_402)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 3, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_430)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 3, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_443, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_445_transpose_x_0 = const()[name = tensor<string, []>("op_445_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_445_transpose_y_0 = const()[name = tensor<string, []>("op_445_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_445 = matmul(transpose_x = var_445_transpose_x_0, transpose_y = var_445_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_445")];
+            tensor<fp32, [3]> var_446 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_446")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
+            tensor<fp32, [1, 4, 3, 64]> cross_3 = mul(x = var_445, y = var_448)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 3, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_405 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_405")];
-            tensor<bool, []> var_407_transpose_x_1 = const()[name = tensor<string, []>("op_407_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_407_transpose_y_1 = const()[name = tensor<string, []>("op_407_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_1, transpose_y = var_407_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_405, y = var_407)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_409)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_411 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_411")];
-            tensor<fp32, [1, 4, 64, 64]> var_412 = real_div(x = new_kv_unnorm_3, y = var_411)[name = tensor<string, []>("op_412")];
-            tensor<int32, [4]> var_413_perm_0 = const()[name = tensor<string, []>("op_413_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_451 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_451")];
+            tensor<bool, []> var_453_transpose_x_1 = const()[name = tensor<string, []>("op_453_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_453_transpose_y_1 = const()[name = tensor<string, []>("op_453_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_453 = matmul(transpose_x = var_453_transpose_x_1, transpose_y = var_453_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_453")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_451, y = var_453)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_455 = const()[name = tensor<string, []>("op_455"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_455)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_457 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_457")];
+            tensor<fp32, [1, 4, 64, 64]> var_458 = real_div(x = new_kv_unnorm_3, y = var_457)[name = tensor<string, []>("op_458")];
+            tensor<int32, [4]> var_459_perm_0 = const()[name = tensor<string, []>("op_459_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_413 = transpose(perm = var_413_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_413)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_417, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 3, 256]> var_419 = silu(x = input_57)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 3, 256]> input_59 = mul(x = var_419, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 3, 4, 64]> var_459 = transpose(perm = var_459_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 3, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_64, x = var_459)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_463 = const()[name = tensor<string, []>("op_463"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_11 = reshape(shape = var_463, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 3, 256]> var_465 = silu(x = input_59)[name = tensor<string, []>("op_465")];
+            tensor<fp32, [1, 3, 256]> input_61 = mul(x = var_465, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 3, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 3, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_9_begin_0 = const()[name = tensor<string, []>("window_9_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_9_end_0 = const()[name = tensor<string, []>("window_9_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_9_end_mask_0 = const()[name = tensor<string, []>("window_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_9_squeeze_mask_0 = const()[name = tensor<string, []>("window_9_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_9 = slice_by_index(begin = window_9_begin_0, end = window_9_end_0, end_mask = window_9_end_mask_0, squeeze_mask = window_9_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_427_begin_0 = const()[name = tensor<string, []>("op_427_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_427_end_0 = const()[name = tensor<string, []>("op_427_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_427_end_mask_0 = const()[name = tensor<string, []>("op_427_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_427 = slice_by_index(begin = var_427_begin_0, end = var_427_end_0, end_mask = var_427_end_mask_0, x = x_9)[name = tensor<string, []>("op_427")];
-            tensor<int32, [3]> var_430_begin_0 = const()[name = tensor<string, []>("op_430_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_430_end_0 = const()[name = tensor<string, []>("op_430_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_430_end_mask_0 = const()[name = tensor<string, []>("op_430_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_430 = slice_by_index(begin = var_430_begin_0, end = var_430_end_0, end_mask = var_430_end_mask_0, x = window_9)[name = tensor<string, []>("op_430")];
+            tensor<int32, [3]> var_473_begin_0 = const()[name = tensor<string, []>("op_473_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_473_end_0 = const()[name = tensor<string, []>("op_473_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_473_end_mask_0 = const()[name = tensor<string, []>("op_473_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_473 = slice_by_index(begin = var_473_begin_0, end = var_473_end_0, end_mask = var_473_end_mask_0, x = x_9)[name = tensor<string, []>("op_473")];
+            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = window_9)[name = tensor<string, []>("op_476")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_26, interleave = window_11_interleave_0, values = (var_430, var_427))[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_72, interleave = window_11_interleave_0, values = (var_476, var_473))[name = tensor<string, []>("window_11")];
+            tensor<int32, [3]> var_481_begin_0 = const()[name = tensor<string, []>("op_481_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_481_end_0 = const()[name = tensor<string, []>("op_481_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_481_end_mask_0 = const()[name = tensor<string, []>("op_481_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_481 = slice_by_index(begin = var_481_begin_0, end = var_481_end_0, end_mask = var_481_end_mask_0, x = x_9)[name = tensor<string, []>("op_481")];
+            tensor<int32, [3]> var_484_begin_0 = const()[name = tensor<string, []>("op_484_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_484_end_0 = const()[name = tensor<string, []>("op_484_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_484_end_mask_0 = const()[name = tensor<string, []>("op_484_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_484 = slice_by_index(begin = var_484_begin_0, end = var_484_end_0, end_mask = var_484_end_mask_0, x = window_11)[name = tensor<string, []>("op_484")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_72, interleave = window_13_interleave_0, values = (var_484, var_481))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_489_begin_0 = const()[name = tensor<string, []>("op_489_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_489_end_0 = const()[name = tensor<string, []>("op_489_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_489_end_mask_0 = const()[name = tensor<string, []>("op_489_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_489 = slice_by_index(begin = var_489_begin_0, end = var_489_end_0, end_mask = var_489_end_mask_0, x = x_9)[name = tensor<string, []>("op_489")];
+            tensor<int32, [3]> var_492_begin_0 = const()[name = tensor<string, []>("op_492_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_492_end_0 = const()[name = tensor<string, []>("op_492_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_492_end_mask_0 = const()[name = tensor<string, []>("op_492_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_492 = slice_by_index(begin = var_492_begin_0, end = var_492_end_0, end_mask = var_492_end_mask_0, x = window_13)[name = tensor<string, []>("op_492")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_72, interleave = window_15_interleave_0, values = (var_492, var_489))[name = tensor<string, []>("window_15")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_63 = concat(axis = var_59, interleave = input_63_interleave_0, values = (window_11, window_13, window_15))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_471_split_sizes_0 = const()[name = tensor<string, []>("op_471_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_471_axis_0 = const()[name = tensor<string, []>("op_471_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_471_0, tensor<fp32, [3, 256, 16]> var_471_1 = split(axis = var_471_axis_0, split_sizes = var_471_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_471")];
-            tensor<fp32, [3, 256, 16]> var_473 = sigmoid(x = var_471_1)[name = tensor<string, []>("op_473")];
-            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_471_0, y = var_473)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [3, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [3, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_517_split_sizes_0 = const()[name = tensor<string, []>("op_517_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_517_axis_0 = const()[name = tensor<string, []>("op_517_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_517_0, tensor<fp32, [3, 256, 16]> var_517_1 = split(axis = var_517_axis_0, split_sizes = var_517_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_517")];
+            tensor<fp32, [3, 256, 16]> var_519 = sigmoid(x = var_517_1)[name = tensor<string, []>("op_519")];
+            tensor<fp32, [3, 256, 16]> inputs_15 = mul(x = var_517_0, y = var_519)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [3, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [3, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [3, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_504_begin_0 = const()[name = tensor<string, []>("op_504_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_504_end_0 = const()[name = tensor<string, []>("op_504_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_504_end_mask_0 = const()[name = tensor<string, []>("op_504_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [3, 1, 256]> var_504 = slice_by_index(begin = var_504_begin_0, end = var_504_end_0, end_mask = var_504_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_504")];
-            tensor<int32, [3]> var_506_perm_0 = const()[name = tensor<string, []>("op_506_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_506 = transpose(perm = var_506_perm_0, x = var_504)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 3, 256]> input_71 = add(x = x_9, y = var_506)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 3, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 3, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_529 = const()[name = tensor<string, []>("op_529"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_530 = mul(x = input_79, y = var_529)[name = tensor<string, []>("op_530")];
-            tensor<fp32, [1, 3, 256]> input_81 = add(x = var_530, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_550_begin_0 = const()[name = tensor<string, []>("op_550_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_550_end_0 = const()[name = tensor<string, []>("op_550_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_550_end_mask_0 = const()[name = tensor<string, []>("op_550_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [3, 1, 256]> var_550 = slice_by_index(begin = var_550_begin_0, end = var_550_end_0, end_mask = var_550_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_550")];
+            tensor<int32, [3]> var_552_perm_0 = const()[name = tensor<string, []>("op_552_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_552 = transpose(perm = var_552_perm_0, x = var_550)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 3, 256]> input_73 = add(x = x_9, y = var_552)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 3, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 3, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 3, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_576 = mul(x = input_81, y = var_575)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> input_83 = add(x = var_576, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 3, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 3, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_559 = const()[name = tensor<string, []>("op_559"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_560 = mul(x = input_91, y = var_559)[name = tensor<string, []>("op_560")];
-            tensor<fp32, [1, 3, 256]> input_93 = add(x = var_560, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 3, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 3, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 3, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 3, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_606 = mul(x = input_93, y = var_605)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 3, 256]> input_95 = add(x = var_606, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 3, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -573,163 +591,163 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 3, 256]> var_574 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_576 = reshape(shape = var_575, x = var_574)[name = tensor<string, []>("op_576")];
+            tensor<fp32, [1, 3, 256]> var_620 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_621 = const()[name = tensor<string, []>("op_621"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_622 = reshape(shape = var_621, x = var_620)[name = tensor<string, []>("op_622")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_580 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_581 = const()[name = tensor<string, []>("op_581"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_582 = mul(x = var_580, y = var_581)[name = tensor<string, []>("op_582")];
-            tensor<int32, [4]> var_583 = const()[name = tensor<string, []>("op_583"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_584 = reshape(shape = var_583, x = var_582)[name = tensor<string, []>("op_584")];
+            tensor<fp32, [1, 3, 256]> var_626 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_627 = const()[name = tensor<string, []>("op_627"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_628 = mul(x = var_626, y = var_627)[name = tensor<string, []>("op_628")];
+            tensor<int32, [4]> var_629 = const()[name = tensor<string, []>("op_629"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_630 = reshape(shape = var_629, x = var_628)[name = tensor<string, []>("op_630")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_588 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_589 = const()[name = tensor<string, []>("op_589"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_590 = reshape(shape = var_589, x = var_588)[name = tensor<string, []>("op_590")];
+            tensor<fp32, [1, 3, 256]> var_634 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_635 = const()[name = tensor<string, []>("op_635"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_636 = reshape(shape = var_635, x = var_634)[name = tensor<string, []>("op_636")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 3, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [3]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [3]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_584)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_576)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 3, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_630)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 3, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_622)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 3, 3]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_600 = const()[name = tensor<string, []>("op_600"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_601 = reshape(shape = var_600, x = sqrt_s_t_5)[name = tensor<string, []>("op_601")];
-            tensor<fp32, [3, 3]> M_5 = real_div(x = encoder__causal_mask, y = var_601)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 3, 3]> var_603 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_603")];
+            tensor<int32, [2]> var_646 = const()[name = tensor<string, []>("op_646"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_647 = reshape(shape = var_646, x = sqrt_s_t_5)[name = tensor<string, []>("op_647")];
+            tensor<fp32, [3, 3]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_647)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 3, 3]> var_649 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_649")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_590)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_603, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_605_transpose_x_0 = const()[name = tensor<string, []>("op_605_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_605_transpose_y_0 = const()[name = tensor<string, []>("op_605_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_605 = matmul(transpose_x = var_605_transpose_x_0, transpose_y = var_605_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_605")];
-            tensor<fp32, [3]> var_606 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_606")];
-            tensor<int32, [4]> var_607 = const()[name = tensor<string, []>("op_607"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_608 = reshape(shape = var_607, x = var_606)[name = tensor<string, []>("op_608")];
-            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_605, y = var_608)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 3, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_636)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 3, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_649, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_651_transpose_x_0 = const()[name = tensor<string, []>("op_651_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_651_transpose_y_0 = const()[name = tensor<string, []>("op_651_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_651 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_651")];
+            tensor<fp32, [3]> var_652 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_652")];
+            tensor<int32, [4]> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_654 = reshape(shape = var_653, x = var_652)[name = tensor<string, []>("op_654")];
+            tensor<fp32, [1, 4, 3, 64]> cross_5 = mul(x = var_651, y = var_654)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 3, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_611 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_611")];
-            tensor<bool, []> var_613_transpose_x_1 = const()[name = tensor<string, []>("op_613_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_613_transpose_y_1 = const()[name = tensor<string, []>("op_613_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_613 = matmul(transpose_x = var_613_transpose_x_1, transpose_y = var_613_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_613")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_611, y = var_613)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_615 = const()[name = tensor<string, []>("op_615"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_615)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_617 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [1, 4, 64, 64]> var_618 = real_div(x = new_kv_unnorm_5, y = var_617)[name = tensor<string, []>("op_618")];
-            tensor<int32, [4]> var_619_perm_0 = const()[name = tensor<string, []>("op_619_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_657 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_657")];
+            tensor<bool, []> var_659_transpose_x_1 = const()[name = tensor<string, []>("op_659_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_659_transpose_y_1 = const()[name = tensor<string, []>("op_659_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_659 = matmul(transpose_x = var_659_transpose_x_1, transpose_y = var_659_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_657, y = var_659)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_661)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_663 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_663")];
+            tensor<fp32, [1, 4, 64, 64]> var_664 = real_div(x = new_kv_unnorm_5, y = var_663)[name = tensor<string, []>("op_664")];
+            tensor<int32, [4]> var_665_perm_0 = const()[name = tensor<string, []>("op_665_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_619 = transpose(perm = var_619_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_619)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_623, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 3, 256]> var_625 = silu(x = input_97)[name = tensor<string, []>("op_625")];
-            tensor<fp32, [1, 3, 256]> input_99 = mul(x = var_625, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 3, 4, 64]> var_665 = transpose(perm = var_665_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 3, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_64, x = var_665)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_669 = const()[name = tensor<string, []>("op_669"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_17 = reshape(shape = var_669, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 3, 256]> var_671 = silu(x = input_99)[name = tensor<string, []>("op_671")];
+            tensor<fp32, [1, 3, 256]> input_101 = mul(x = var_671, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 3, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 3, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_17_begin_0 = const()[name = tensor<string, []>("window_17_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_17_end_0 = const()[name = tensor<string, []>("window_17_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_17_end_mask_0 = const()[name = tensor<string, []>("window_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_17_squeeze_mask_0 = const()[name = tensor<string, []>("window_17_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_17 = slice_by_index(begin = window_17_begin_0, end = window_17_end_0, end_mask = window_17_end_mask_0, squeeze_mask = window_17_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_633_begin_0 = const()[name = tensor<string, []>("op_633_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_633_end_0 = const()[name = tensor<string, []>("op_633_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_633_end_mask_0 = const()[name = tensor<string, []>("op_633_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_633 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = x_15)[name = tensor<string, []>("op_633")];
-            tensor<int32, [3]> var_636_begin_0 = const()[name = tensor<string, []>("op_636_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_636_end_0 = const()[name = tensor<string, []>("op_636_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_636_end_mask_0 = const()[name = tensor<string, []>("op_636_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_636 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = window_17)[name = tensor<string, []>("op_636")];
+            tensor<int32, [3]> var_679_begin_0 = const()[name = tensor<string, []>("op_679_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_679_end_0 = const()[name = tensor<string, []>("op_679_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_679_end_mask_0 = const()[name = tensor<string, []>("op_679_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_679 = slice_by_index(begin = var_679_begin_0, end = var_679_end_0, end_mask = var_679_end_mask_0, x = x_15)[name = tensor<string, []>("op_679")];
+            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = window_17)[name = tensor<string, []>("op_682")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_636, var_633))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_641_begin_0 = const()[name = tensor<string, []>("op_641_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_641_end_0 = const()[name = tensor<string, []>("op_641_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_641_end_mask_0 = const()[name = tensor<string, []>("op_641_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_641 = slice_by_index(begin = var_641_begin_0, end = var_641_end_0, end_mask = var_641_end_mask_0, x = x_15)[name = tensor<string, []>("op_641")];
-            tensor<int32, [3]> var_644_begin_0 = const()[name = tensor<string, []>("op_644_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_644_end_0 = const()[name = tensor<string, []>("op_644_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_644_end_mask_0 = const()[name = tensor<string, []>("op_644_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_644 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = window_19)[name = tensor<string, []>("op_644")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_72, interleave = window_19_interleave_0, values = (var_682, var_679))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_687_begin_0 = const()[name = tensor<string, []>("op_687_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_687_end_0 = const()[name = tensor<string, []>("op_687_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_687_end_mask_0 = const()[name = tensor<string, []>("op_687_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_687 = slice_by_index(begin = var_687_begin_0, end = var_687_end_0, end_mask = var_687_end_mask_0, x = x_15)[name = tensor<string, []>("op_687")];
+            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = window_19)[name = tensor<string, []>("op_690")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_26, interleave = window_21_interleave_0, values = (var_644, var_641))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_72, interleave = window_21_interleave_0, values = (var_690, var_687))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_695_begin_0 = const()[name = tensor<string, []>("op_695_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_695_end_0 = const()[name = tensor<string, []>("op_695_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_695_end_mask_0 = const()[name = tensor<string, []>("op_695_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_695 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = x_15)[name = tensor<string, []>("op_695")];
+            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = window_21)[name = tensor<string, []>("op_698")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_72, interleave = window_23_interleave_0, values = (var_698, var_695))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_103 = concat(axis = var_59, interleave = input_103_interleave_0, values = (window_19, window_21, window_23))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_677_split_sizes_0 = const()[name = tensor<string, []>("op_677_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_677_axis_0 = const()[name = tensor<string, []>("op_677_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_677_0, tensor<fp32, [3, 256, 16]> var_677_1 = split(axis = var_677_axis_0, split_sizes = var_677_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_677")];
-            tensor<fp32, [3, 256, 16]> var_679 = sigmoid(x = var_677_1)[name = tensor<string, []>("op_679")];
-            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_677_0, y = var_679)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [3, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [3, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_723_split_sizes_0 = const()[name = tensor<string, []>("op_723_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_723_axis_0 = const()[name = tensor<string, []>("op_723_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_723_0, tensor<fp32, [3, 256, 16]> var_723_1 = split(axis = var_723_axis_0, split_sizes = var_723_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_723")];
+            tensor<fp32, [3, 256, 16]> var_725 = sigmoid(x = var_723_1)[name = tensor<string, []>("op_725")];
+            tensor<fp32, [3, 256, 16]> inputs_25 = mul(x = var_723_0, y = var_725)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [3, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [3, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [3, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [3, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_710_begin_0 = const()[name = tensor<string, []>("op_710_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_710_end_0 = const()[name = tensor<string, []>("op_710_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_710_end_mask_0 = const()[name = tensor<string, []>("op_710_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [3, 1, 256]> var_710 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_710")];
-            tensor<int32, [3]> var_712_perm_0 = const()[name = tensor<string, []>("op_712_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_712 = transpose(perm = var_712_perm_0, x = var_710)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 3, 256]> input_111 = add(x = x_15, y = var_712)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 3, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 3, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_735 = const()[name = tensor<string, []>("op_735"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_736 = mul(x = input_119, y = var_735)[name = tensor<string, []>("op_736")];
-            tensor<fp32, [1, 3, 256]> input_121 = add(x = var_736, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [3, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_758_perm_0 = const()[name = tensor<string, []>("op_758_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_758 = transpose(perm = var_758_perm_0, x = var_756)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 3, 256]> input_113 = add(x = x_15, y = var_758)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 3, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 3, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 3, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_782 = mul(x = input_121, y = var_781)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> input_123 = add(x = var_782, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 3, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 3, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_765 = const()[name = tensor<string, []>("op_765"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_766 = mul(x = input_131, y = var_765)[name = tensor<string, []>("op_766")];
-            tensor<fp32, [1, 3, 256]> input_133 = add(x = var_766, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 3, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 3, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 3, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 3, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_812 = mul(x = input_133, y = var_811)[name = tensor<string, []>("op_812")];
+            tensor<fp32, [1, 3, 256]> input_135 = add(x = var_812, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 3, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_56, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -740,199 +758,192 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 3, 256]> var_780 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_781 = const()[name = tensor<string, []>("op_781"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_782 = reshape(shape = var_781, x = var_780)[name = tensor<string, []>("op_782")];
+            tensor<fp32, [1, 3, 256]> var_826 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_827 = const()[name = tensor<string, []>("op_827"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_828 = reshape(shape = var_827, x = var_826)[name = tensor<string, []>("op_828")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_786 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_787 = const()[name = tensor<string, []>("op_787"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 3, 256]> var_788 = mul(x = var_786, y = var_787)[name = tensor<string, []>("op_788")];
-            tensor<int32, [4]> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_790 = reshape(shape = var_789, x = var_788)[name = tensor<string, []>("op_790")];
+            tensor<fp32, [1, 3, 256]> var_832 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_833 = const()[name = tensor<string, []>("op_833"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 3, 256]> var_834 = mul(x = var_832, y = var_833)[name = tensor<string, []>("op_834")];
+            tensor<int32, [4]> var_835 = const()[name = tensor<string, []>("op_835"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_836 = reshape(shape = var_835, x = var_834)[name = tensor<string, []>("op_836")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> var_794 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_795 = const()[name = tensor<string, []>("op_795"), val = tensor<int32, [4]>([1, 3, 4, 64])];
-            tensor<fp32, [1, 3, 4, 64]> var_796 = reshape(shape = var_795, x = var_794)[name = tensor<string, []>("op_796")];
+            tensor<fp32, [1, 3, 256]> var_840 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_841 = const()[name = tensor<string, []>("op_841"), val = tensor<int32, [4]>([1, 3, 4, 64])];
+            tensor<fp32, [1, 3, 4, 64]> var_842 = reshape(shape = var_841, x = var_840)[name = tensor<string, []>("op_842")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 3, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 3, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [3]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [3]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_790)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_782)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 3, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_836)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 3, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_828)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 3, 3]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_806 = const()[name = tensor<string, []>("op_806"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_807 = reshape(shape = var_806, x = sqrt_s_t_7)[name = tensor<string, []>("op_807")];
-            tensor<fp32, [3, 3]> M_7 = real_div(x = encoder__causal_mask, y = var_807)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 3, 3]> var_809 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_809")];
+            tensor<int32, [2]> var_852 = const()[name = tensor<string, []>("op_852"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_853 = reshape(shape = var_852, x = sqrt_s_t_7)[name = tensor<string, []>("op_853")];
+            tensor<fp32, [3, 3]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_853)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 3, 3]> var_855 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_855")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_796)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_809, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_811_transpose_x_0 = const()[name = tensor<string, []>("op_811_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_811_transpose_y_0 = const()[name = tensor<string, []>("op_811_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 3, 64]> var_811 = matmul(transpose_x = var_811_transpose_x_0, transpose_y = var_811_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_811")];
-            tensor<fp32, [3]> var_812 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
-            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_811, y = var_814)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 3, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_842)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 3, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_855, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_857_transpose_x_0 = const()[name = tensor<string, []>("op_857_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_857_transpose_y_0 = const()[name = tensor<string, []>("op_857_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 3, 64]> var_857 = matmul(transpose_x = var_857_transpose_x_0, transpose_y = var_857_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_857")];
+            tensor<fp32, [3]> var_858 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [4]> var_859 = const()[name = tensor<string, []>("op_859"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_860 = reshape(shape = var_859, x = var_858)[name = tensor<string, []>("op_860")];
+            tensor<fp32, [1, 4, 3, 64]> cross_7 = mul(x = var_857, y = var_860)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 3, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_817 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_817")];
-            tensor<bool, []> var_819_transpose_x_1 = const()[name = tensor<string, []>("op_819_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_819_transpose_y_1 = const()[name = tensor<string, []>("op_819_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_819 = matmul(transpose_x = var_819_transpose_x_1, transpose_y = var_819_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_819")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_817, y = var_819)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_821 = const()[name = tensor<string, []>("op_821"), val = tensor<fp32, []>(0x1.8p+1)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_821)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_823 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_823")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_823)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_825_perm_0 = const()[name = tensor<string, []>("op_825_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_863 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_863")];
+            tensor<bool, []> var_865_transpose_x_1 = const()[name = tensor<string, []>("op_865_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_865_transpose_y_1 = const()[name = tensor<string, []>("op_865_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_865 = matmul(transpose_x = var_865_transpose_x_1, transpose_y = var_865_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_865")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_863, y = var_865)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1.8p+1)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_867)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_869 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_869")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_869)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_871_perm_0 = const()[name = tensor<string, []>("op_871_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 4, 64]> var_825 = transpose(perm = var_825_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_825)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_829, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 3, 256]> var_831 = silu(x = input_137)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [1, 3, 256]> input_139 = mul(x = var_831, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 3, 4, 64]> var_871 = transpose(perm = var_871_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 3, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_64, x = var_871)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<fp32, [1, 3, 256]> out_23 = reshape(shape = var_875, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 3, 256]> var_877 = silu(x = input_139)[name = tensor<string, []>("op_877")];
+            tensor<fp32, [1, 3, 256]> input_141 = mul(x = var_877, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 3, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 3, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_839_begin_0 = const()[name = tensor<string, []>("op_839_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_839_end_0 = const()[name = tensor<string, []>("op_839_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_839_end_mask_0 = const()[name = tensor<string, []>("op_839_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_839 = slice_by_index(begin = var_839_begin_0, end = var_839_end_0, end_mask = var_839_end_mask_0, x = x_21)[name = tensor<string, []>("op_839")];
-            tensor<int32, [3]> var_842_begin_0 = const()[name = tensor<string, []>("op_842_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_842_end_0 = const()[name = tensor<string, []>("op_842_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_842_end_mask_0 = const()[name = tensor<string, []>("op_842_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_842 = slice_by_index(begin = var_842_begin_0, end = var_842_end_0, end_mask = var_842_end_mask_0, x = window_25)[name = tensor<string, []>("op_842")];
+            tensor<int32, [3]> var_885_begin_0 = const()[name = tensor<string, []>("op_885_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_885_end_0 = const()[name = tensor<string, []>("op_885_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_885_end_mask_0 = const()[name = tensor<string, []>("op_885_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_885 = slice_by_index(begin = var_885_begin_0, end = var_885_end_0, end_mask = var_885_end_mask_0, x = x_21)[name = tensor<string, []>("op_885")];
+            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = window_25)[name = tensor<string, []>("op_888")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_842, var_839))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_847_begin_0 = const()[name = tensor<string, []>("op_847_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_847_end_0 = const()[name = tensor<string, []>("op_847_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_847_end_mask_0 = const()[name = tensor<string, []>("op_847_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_847 = slice_by_index(begin = var_847_begin_0, end = var_847_end_0, end_mask = var_847_end_mask_0, x = x_21)[name = tensor<string, []>("op_847")];
-            tensor<int32, [3]> var_850_begin_0 = const()[name = tensor<string, []>("op_850_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_850_end_0 = const()[name = tensor<string, []>("op_850_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_850_end_mask_0 = const()[name = tensor<string, []>("op_850_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_850 = slice_by_index(begin = var_850_begin_0, end = var_850_end_0, end_mask = var_850_end_mask_0, x = window_27)[name = tensor<string, []>("op_850")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_72, interleave = window_27_interleave_0, values = (var_888, var_885))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_893_begin_0 = const()[name = tensor<string, []>("op_893_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_893_end_0 = const()[name = tensor<string, []>("op_893_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_893_end_mask_0 = const()[name = tensor<string, []>("op_893_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_893 = slice_by_index(begin = var_893_begin_0, end = var_893_end_0, end_mask = var_893_end_mask_0, x = x_21)[name = tensor<string, []>("op_893")];
+            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = window_27)[name = tensor<string, []>("op_896")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_850, var_847))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_855_begin_0 = const()[name = tensor<string, []>("op_855_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_855_end_0 = const()[name = tensor<string, []>("op_855_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_855_end_mask_0 = const()[name = tensor<string, []>("op_855_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_855 = slice_by_index(begin = var_855_begin_0, end = var_855_end_0, end_mask = var_855_end_mask_0, x = x_21)[name = tensor<string, []>("op_855")];
-            tensor<int32, [3]> var_858_begin_0 = const()[name = tensor<string, []>("op_858_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_858_end_0 = const()[name = tensor<string, []>("op_858_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_858_end_mask_0 = const()[name = tensor<string, []>("op_858_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_858 = slice_by_index(begin = var_858_begin_0, end = var_858_end_0, end_mask = var_858_end_mask_0, x = window_29)[name = tensor<string, []>("op_858")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_72, interleave = window_29_interleave_0, values = (var_896, var_893))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_901_begin_0 = const()[name = tensor<string, []>("op_901_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_901_end_0 = const()[name = tensor<string, []>("op_901_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_901_end_mask_0 = const()[name = tensor<string, []>("op_901_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_901 = slice_by_index(begin = var_901_begin_0, end = var_901_end_0, end_mask = var_901_end_mask_0, x = x_21)[name = tensor<string, []>("op_901")];
+            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = window_29)[name = tensor<string, []>("op_904")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_858, var_855))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_72, interleave = window_interleave_0, values = (var_904, var_901))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [3, 16, 256]> input_143 = concat(axis = var_59, interleave = input_143_interleave_0, values = (window_27, window_29, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [3, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_883_split_sizes_0 = const()[name = tensor<string, []>("op_883_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_883_axis_0 = const()[name = tensor<string, []>("op_883_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> var_883_0, tensor<fp32, [3, 256, 16]> var_883_1 = split(axis = var_883_axis_0, split_sizes = var_883_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_883")];
-            tensor<fp32, [3, 256, 16]> var_885 = sigmoid(x = var_883_1)[name = tensor<string, []>("op_885")];
-            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_883_0, y = var_885)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [3, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [3, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_929_split_sizes_0 = const()[name = tensor<string, []>("op_929_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_929_axis_0 = const()[name = tensor<string, []>("op_929_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> var_929_0, tensor<fp32, [3, 256, 16]> var_929_1 = split(axis = var_929_axis_0, split_sizes = var_929_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [3, 256, 16]> var_931 = sigmoid(x = var_929_1)[name = tensor<string, []>("op_931")];
+            tensor<fp32, [3, 256, 16]> inputs_35 = mul(x = var_929_0, y = var_931)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [3, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [3, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [3, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [3, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([3, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [3, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [3, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_56, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [3, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [3, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_916_begin_0 = const()[name = tensor<string, []>("op_916_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_916_end_0 = const()[name = tensor<string, []>("op_916_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
-            tensor<bool, [3]> var_916_end_mask_0 = const()[name = tensor<string, []>("op_916_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [3, 1, 256]> var_916 = slice_by_index(begin = var_916_begin_0, end = var_916_end_0, end_mask = var_916_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_916")];
-            tensor<int32, [3]> var_918_perm_0 = const()[name = tensor<string, []>("op_918_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 3, 256]> var_918 = transpose(perm = var_918_perm_0, x = var_916)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 3, 256]> input_151 = add(x = x_21, y = var_918)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 3, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 3, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_941 = const()[name = tensor<string, []>("op_941"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 3, 256]> var_942 = mul(x = input_159, y = var_941)[name = tensor<string, []>("op_942")];
-            tensor<fp32, [1, 3, 256]> input_161 = add(x = var_942, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([3, 16, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [3, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [3, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_964_perm_0 = const()[name = tensor<string, []>("op_964_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 3, 256]> var_964 = transpose(perm = var_964_perm_0, x = var_962)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 3, 256]> input_153 = add(x = x_21, y = var_964)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_56, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 3, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 3, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 3, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 3, 256]> var_988 = mul(x = input_161, y = var_987)[name = tensor<string, []>("op_988")];
+            tensor<fp32, [1, 3, 256]> input_163 = add(x = var_988, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 3, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_56, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 3]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 21]> cat = concat(axis = var_61, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
-            tensor<int32, [3]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
-            tensor<bool, [3]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = cat)[name = tensor<string, []>("op_960")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_962 = const()[name = tensor<string, []>("op_962"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 3, 1]> var_963 = reduce_l2_norm(axes = var_962, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_963")];
+            tensor<fp32, [1, 256, 3]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1006_begin_0 = const()[name = tensor<string, []>("op_1006_begin_0"), val = tensor<int32, [3]>([0, 0, 3])];
+            tensor<int32, [3]> var_1006_end_0 = const()[name = tensor<string, []>("op_1006_end_0"), val = tensor<int32, [3]>([1, 256, 21])];
+            tensor<bool, [3]> var_1006_end_mask_0 = const()[name = tensor<string, []>("op_1006_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1006_begin_0, end = var_1006_end_0, end_mask = var_1006_end_mask_0, x = cat)[name = tensor<string, []>("op_1006")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1008 = const()[name = tensor<string, []>("op_1008"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 3, 1]> var_1009 = reduce_l2_norm(axes = var_1008, keep_dims = var_55, x = input_165)[name = tensor<string, []>("op_1009")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_963)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_967_axis_0 = const()[name = tensor<string, []>("op_967_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_967_axis_0, values = (var_206, var_412, var_618, nkv_1))[name = tensor<string, []>("op_967")];
-            tensor<int32, []> var_969_axis_0 = const()[name = tensor<string, []>("op_969_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_969_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_969")];
-            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_971_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_971")];
-            tensor<fp32, []> var_980 = const()[name = tensor<string, []>("op_980"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_985 = const()[name = tensor<string, []>("op_985"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_987 = const()[name = tensor<string, []>("op_987"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_988 = const()[name = tensor<string, []>("op_988"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_990 = const()[name = tensor<string, []>("op_990"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1000 = const()[name = tensor<string, []>("op_1000"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 3, 1]> clip_0 = clip(alpha = var_69, beta = const_12, x = var_1009)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 3, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1013_axis_0 = const()[name = tensor<string, []>("op_1013_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1013_axis_0, values = (var_252, var_458, var_664, nkv_1))[name = tensor<string, []>("op_1013")];
+            tensor<int32, []> var_1015_axis_0 = const()[name = tensor<string, []>("op_1015_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1015_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1015")];
+            tensor<int32, []> var_1017_axis_0 = const()[name = tensor<string, []>("op_1017_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1017_axis_0, values = (window_7, window_15, window_23, window))[name = tensor<string, []>("op_1017")];
             tensor<fp32, [1, 3, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 3, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395328)))];
-            tensor<int32, [1]> var_1062_axes_0 = const()[name = tensor<string, []>("op_1062_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 3, 1, 256]> var_1062 = expand_dims(axes = var_1062_axes_0, x = emb)[name = tensor<string, []>("op_1062")];
+            tensor<int32, [1]> var_1085_axes_0 = const()[name = tensor<string, []>("op_1085_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 3, 1, 256]> var_1085 = expand_dims(axes = var_1085_axes_0, x = emb)[name = tensor<string, []>("op_1085")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 3, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1062)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 3, 12, 512]> input_165 = concat(axis = var_994, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 3, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1070_perm_0 = const()[name = tensor<string, []>("op_1070_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1074 = const()[name = tensor<string, []>("op_1074"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1070 = transpose(perm = var_1070_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 3, 256]> x_29 = reshape(shape = var_1074, x = var_1070)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 3, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1085)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 3, 12, 512]> input_167 = concat(axis = var_62, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 3, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1093_perm_0 = const()[name = tensor<string, []>("op_1093_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1093 = transpose(perm = var_1093_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 3, 256]> x_29 = reshape(shape = var_1097, x = var_1093)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -943,132 +954,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 3, 256]> var_1082 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1083 = const()[name = tensor<string, []>("op_1083"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1084 = reshape(shape = var_1083, x = var_1082)[name = tensor<string, []>("op_1084")];
+            tensor<fp32, [12, 3, 256]> var_1105 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1107 = reshape(shape = var_1106, x = var_1105)[name = tensor<string, []>("op_1107")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1088 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1089 = const()[name = tensor<string, []>("op_1089"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 3, 256]> var_1090 = mul(x = var_1088, y = var_1089)[name = tensor<string, []>("op_1090")];
-            tensor<int32, [4]> var_1091 = const()[name = tensor<string, []>("op_1091"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1092 = reshape(shape = var_1091, x = var_1090)[name = tensor<string, []>("op_1092")];
+            tensor<fp32, [12, 3, 256]> var_1111 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1112 = const()[name = tensor<string, []>("op_1112"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 3, 256]> var_1113 = mul(x = var_1111, y = var_1112)[name = tensor<string, []>("op_1113")];
+            tensor<int32, [4]> var_1114 = const()[name = tensor<string, []>("op_1114"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1115 = reshape(shape = var_1114, x = var_1113)[name = tensor<string, []>("op_1115")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1096 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1097 = const()[name = tensor<string, []>("op_1097"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1098 = reshape(shape = var_1097, x = var_1096)[name = tensor<string, []>("op_1098")];
+            tensor<fp32, [12, 3, 256]> var_1119 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 3, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_1000, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [3]> cumsum_mask_1 = cumsum(axis = var_59, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [3]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_1 = clip(alpha = var_990, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [3]> clip_1 = clip(alpha = var_49, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [3]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1092)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1084)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 3, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1115)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 3, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1107)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 3, 3]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [2]>([1, 3])];
-            tensor<fp32, [1, 3]> var_1111 = reshape(shape = var_1110, x = valid_mask)[name = tensor<string, []>("op_1111")];
-            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1111)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1113 = const()[name = tensor<string, []>("op_1113"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1114 = reshape(shape = var_1113, x = sqrt_s_t_9)[name = tensor<string, []>("op_1114")];
-            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1114)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 3, 3]> var_1116 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1116")];
+            tensor<int32, [2]> var_1133 = const()[name = tensor<string, []>("op_1133"), val = tensor<int32, [2]>([1, 3])];
+            tensor<fp32, [1, 3]> var_1134 = reshape(shape = var_1133, x = valid_mask)[name = tensor<string, []>("op_1134")];
+            tensor<fp32, [3, 3]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1134)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1136 = const()[name = tensor<string, []>("op_1136"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1137 = reshape(shape = var_1136, x = sqrt_s_t_9)[name = tensor<string, []>("op_1137")];
+            tensor<fp32, [3, 3]> M_9 = real_div(x = causal_with_valid_1, y = var_1137)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 3, 3]> var_1139 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1139")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1098)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1116, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1118_transpose_x_0 = const()[name = tensor<string, []>("op_1118_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1118_transpose_y_0 = const()[name = tensor<string, []>("op_1118_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> var_1118 = matmul(transpose_x = var_1118_transpose_x_0, transpose_y = var_1118_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1118")];
-            tensor<fp32, [3]> var_1119 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1119")];
-            tensor<int32, [4]> var_1120 = const()[name = tensor<string, []>("op_1120"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1121 = reshape(shape = var_1120, x = var_1119)[name = tensor<string, []>("op_1121")];
-            tensor<fp32, [12, 4, 3, 64]> cross_9 = mul(x = var_1118, y = var_1121)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 3, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1121)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 3, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1139, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1141_transpose_x_0 = const()[name = tensor<string, []>("op_1141_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1141_transpose_y_0 = const()[name = tensor<string, []>("op_1141_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> var_1141 = matmul(transpose_x = var_1141_transpose_x_0, transpose_y = var_1141_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1141")];
+            tensor<fp32, [3]> var_1142 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1142")];
+            tensor<int32, [4]> var_1143 = const()[name = tensor<string, []>("op_1143"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1144 = reshape(shape = var_1143, x = var_1142)[name = tensor<string, []>("op_1144")];
+            tensor<fp32, [12, 4, 3, 64]> cross_9 = mul(x = var_1141, y = var_1144)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 3, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1124 = const()[name = tensor<string, []>("op_1124"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1125 = reshape(shape = var_1124, x = valid_mask)[name = tensor<string, []>("op_1125")];
-            tensor<fp32, [12, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1125)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1127 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1127")];
-            tensor<bool, []> var_1129_transpose_x_1 = const()[name = tensor<string, []>("op_1129_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1129_transpose_y_1 = const()[name = tensor<string, []>("op_1129_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1129 = matmul(transpose_x = var_1129_transpose_x_1, transpose_y = var_1129_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1129")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1127, y = var_1129)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1131_keep_dims_0 = const()[name = tensor<string, []>("op_1131_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1131 = reduce_sum(keep_dims = var_1131_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1131")];
-            tensor<int32, [1]> var_1132 = const()[name = tensor<string, []>("op_1132"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1133 = reshape(shape = var_1132, x = var_1131)[name = tensor<string, []>("op_1133")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1133)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1147 = const()[name = tensor<string, []>("op_1147"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1148 = reshape(shape = var_1147, x = valid_mask)[name = tensor<string, []>("op_1148")];
+            tensor<fp32, [12, 4, 3, 64]> v_masked_1 = mul(x = v_9, y = var_1148)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1150 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1150")];
+            tensor<bool, []> var_1152_transpose_x_1 = const()[name = tensor<string, []>("op_1152_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1152_transpose_y_1 = const()[name = tensor<string, []>("op_1152_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1152 = matmul(transpose_x = var_1152_transpose_x_1, transpose_y = var_1152_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1152")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1150, y = var_1152)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1154_keep_dims_0 = const()[name = tensor<string, []>("op_1154_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1154 = reduce_sum(keep_dims = var_1154_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1154")];
+            tensor<int32, [1]> var_1155 = const()[name = tensor<string, []>("op_1155"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1156 = reshape(shape = var_1155, x = var_1154)[name = tensor<string, []>("op_1156")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1156)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_990, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_49, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1137 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1137")];
-            tensor<int32, [4]> var_1138_perm_0 = const()[name = tensor<string, []>("op_1138_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1160 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1160")];
+            tensor<int32, [4]> var_1161_perm_0 = const()[name = tensor<string, []>("op_1161_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 4, 64]> var_1138 = transpose(perm = var_1138_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_987, x = var_1138)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> out_29 = reshape(shape = var_1142, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 3, 256]> var_1144 = silu(x = input_169)[name = tensor<string, []>("op_1144")];
-            tensor<fp32, [12, 3, 256]> input_171 = mul(x = var_1144, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 3, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 3, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 3, 4, 64]> var_1161 = transpose(perm = var_1161_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 3, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_64, x = var_1161)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1165 = const()[name = tensor<string, []>("op_1165"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> out_29 = reshape(shape = var_1165, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 3, 256]> var_1167 = silu(x = input_171)[name = tensor<string, []>("op_1167")];
+            tensor<fp32, [12, 3, 256]> input_173 = mul(x = var_1167, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 3, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 3, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_985, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<int32, [4]>([1, 12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1155 = reshape(shape = var_1154, x = xt_1)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156_perm_0 = const()[name = tensor<string, []>("op_1156_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1159 = const()[name = tensor<string, []>("op_1159"), val = tensor<int32, [3]>([3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> var_1156 = transpose(perm = var_1156_perm_0, x = var_1155)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [3, 12, 256]> query_1 = reshape(shape = var_1159, x = var_1156)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 3, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_56, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1177 = const()[name = tensor<string, []>("op_1177"), val = tensor<int32, [4]>([1, 12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1178 = reshape(shape = var_1177, x = xt_1)[name = tensor<string, []>("op_1178")];
+            tensor<int32, [4]> var_1179_perm_0 = const()[name = tensor<string, []>("op_1179_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> var_1179 = transpose(perm = var_1179_perm_0, x = var_1178)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [3, 12, 256]> query_1 = reshape(shape = var_1182, x = var_1179)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 3, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 3, 768]> var_1182 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 3, 768]> var_1205 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 3, 3, 256])];
-            tensor<fp32, [12, 3, 3, 256]> var_1184 = reshape(shape = concat_1, x = var_1182)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [1]> var_1185_axes_0 = const()[name = tensor<string, []>("op_1185_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 3, 3, 256]> var_1185 = expand_dims(axes = var_1185_axes_0, x = var_1184)[name = tensor<string, []>("op_1185")];
-            tensor<int32, [5]> var_1186_perm_0 = const()[name = tensor<string, []>("op_1186_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1187_axes_0 = const()[name = tensor<string, []>("op_1187_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 3, 1, 256]> var_1186 = transpose(perm = var_1186_perm_0, x = var_1185)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 3, 256]> var_1187 = squeeze(axes = var_1187_axes_0, x = var_1186)[name = tensor<string, []>("op_1187")];
+            tensor<fp32, [12, 3, 3, 256]> var_1207 = reshape(shape = concat_1, x = var_1205)[name = tensor<string, []>("op_1207")];
+            tensor<int32, [1]> var_1208_axes_0 = const()[name = tensor<string, []>("op_1208_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 3, 3, 256]> var_1208 = expand_dims(axes = var_1208_axes_0, x = var_1207)[name = tensor<string, []>("op_1208")];
+            tensor<int32, [5]> var_1209_perm_0 = const()[name = tensor<string, []>("op_1209_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1210_axes_0 = const()[name = tensor<string, []>("op_1210_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 3, 1, 256]> var_1209 = transpose(perm = var_1209_perm_0, x = var_1208)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 3, 256]> var_1210 = squeeze(axes = var_1210_axes_0, x = var_1209)[name = tensor<string, []>("op_1210")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 3, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 3, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 3, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 3, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 3, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1187)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1195 = const()[name = tensor<string, []>("op_1195"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1196 = reshape(shape = var_1195, x = q_11)[name = tensor<string, []>("op_1196")];
+            tensor<fp32, [12, 3, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1210)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1219 = reshape(shape = var_1218, x = q_11)[name = tensor<string, []>("op_1219")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1202 = const()[name = tensor<string, []>("op_1202"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1203 = reshape(shape = var_1202, x = k_11)[name = tensor<string, []>("op_1203")];
+            tensor<int32, [3]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1226 = reshape(shape = var_1225, x = k_11)[name = tensor<string, []>("op_1226")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1209 = const()[name = tensor<string, []>("op_1209"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1210 = reshape(shape = var_1209, x = v_11)[name = tensor<string, []>("op_1210")];
+            tensor<int32, [3]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1233 = reshape(shape = var_1232, x = v_11)[name = tensor<string, []>("op_1233")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1213 = const()[name = tensor<string, []>("op_1213"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1196)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [3, 4, 12, 64]> q_15 = reshape(shape = var_1213, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1215 = const()[name = tensor<string, []>("op_1215"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1203)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [3, 4, 12, 64]> k_15 = reshape(shape = var_1215, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1217 = const()[name = tensor<string, []>("op_1217"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1210)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [3, 4, 12, 64]> v_15 = reshape(shape = var_1217, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1236 = const()[name = tensor<string, []>("op_1236"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1219)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [3, 4, 12, 64]> q_15 = reshape(shape = var_1236, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1238 = const()[name = tensor<string, []>("op_1238"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1226)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [3, 4, 12, 64]> k_15 = reshape(shape = var_1238, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1233)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [3, 4, 12, 64]> v_15 = reshape(shape = var_1240, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1079,30 +1090,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1220 = const()[name = tensor<string, []>("op_1220"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1225 = const()[name = tensor<string, []>("op_1225"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [12, 3, 4, 64]> var_1221 = transpose(perm = var_1220, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1225, x = var_1221)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1229 = const()[name = tensor<string, []>("op_1229"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> attn_output_7 = reshape(shape = var_1229, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1243 = const()[name = tensor<string, []>("op_1243"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1248 = const()[name = tensor<string, []>("op_1248"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [12, 3, 4, 64]> var_1244 = transpose(perm = var_1243, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [36, 256]> attn_output_3 = reshape(shape = var_1248, x = var_1244)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [36, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> attn_output_7 = reshape(shape = var_1252, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [3, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_985, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [3, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [3, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [3, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [3, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_56, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [3, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [3, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [3, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [3, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_985, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([1, 3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> x_31 = reshape(shape = var_1249, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1255 = const()[name = tensor<string, []>("op_1255"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 3, 256]> x = reshape(shape = var_1255, x = var_1251)[name = tensor<string, []>("x")];
+            tensor<fp32, [3, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_56, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([1, 3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> x_31 = reshape(shape = var_1272, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1274_perm_0 = const()[name = tensor<string, []>("op_1274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1274 = transpose(perm = var_1274_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 3, 256]> x = reshape(shape = var_1278, x = var_1274)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1113,120 +1124,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 3, 256]> var_1263 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1264 = const()[name = tensor<string, []>("op_1264"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1265 = reshape(shape = var_1264, x = var_1263)[name = tensor<string, []>("op_1265")];
+            tensor<fp32, [12, 3, 256]> var_1286 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1288 = reshape(shape = var_1287, x = var_1286)[name = tensor<string, []>("op_1288")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1269 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1270 = const()[name = tensor<string, []>("op_1270"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 3, 256]> var_1271 = mul(x = var_1269, y = var_1270)[name = tensor<string, []>("op_1271")];
-            tensor<int32, [4]> var_1272 = const()[name = tensor<string, []>("op_1272"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1273 = reshape(shape = var_1272, x = var_1271)[name = tensor<string, []>("op_1273")];
+            tensor<fp32, [12, 3, 256]> var_1292 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1293 = const()[name = tensor<string, []>("op_1293"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 3, 256]> var_1294 = mul(x = var_1292, y = var_1293)[name = tensor<string, []>("op_1294")];
+            tensor<int32, [4]> var_1295 = const()[name = tensor<string, []>("op_1295"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1296 = reshape(shape = var_1295, x = var_1294)[name = tensor<string, []>("op_1296")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> var_1277 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([12, 3, 4, 64])];
-            tensor<fp32, [12, 3, 4, 64]> var_1279 = reshape(shape = var_1278, x = var_1277)[name = tensor<string, []>("op_1279")];
+            tensor<fp32, [12, 3, 256]> var_1300 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([12, 3, 4, 64])];
+            tensor<fp32, [12, 3, 4, 64]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 3, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 3, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [3]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [3]> clip_3 = clip(alpha = var_990, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [3]> clip_3 = clip(alpha = var_49, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [3]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1273)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1265)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 3, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1296)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 3, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1288)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 3, 3]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [2]>([3, 1])];
-            tensor<fp32, [3, 1]> var_1295 = reshape(shape = var_1294, x = sqrt_s_t)[name = tensor<string, []>("op_1295")];
-            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1295)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 3, 3]> var_1297 = mul(x = qk, y = M)[name = tensor<string, []>("op_1297")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1279)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 3, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1297, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1299_transpose_x_0 = const()[name = tensor<string, []>("op_1299_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1299_transpose_y_0 = const()[name = tensor<string, []>("op_1299_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 3, 64]> var_1299 = matmul(transpose_x = var_1299_transpose_x_0, transpose_y = var_1299_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1299")];
-            tensor<fp32, [3]> var_1300 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1300")];
-            tensor<int32, [4]> var_1301 = const()[name = tensor<string, []>("op_1301"), val = tensor<int32, [4]>([1, 1, 3, 1])];
-            tensor<fp32, [1, 1, 3, 1]> var_1302 = reshape(shape = var_1301, x = var_1300)[name = tensor<string, []>("op_1302")];
-            tensor<fp32, [12, 4, 3, 64]> cross = mul(x = var_1299, y = var_1302)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 3, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1125)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1308 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1308")];
-            tensor<bool, []> var_1310_transpose_x_1 = const()[name = tensor<string, []>("op_1310_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1310_transpose_y_1 = const()[name = tensor<string, []>("op_1310_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1310 = matmul(transpose_x = var_1310_transpose_x_1, transpose_y = var_1310_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1310")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1308, y = var_1310)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1133)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [2]>([3, 1])];
+            tensor<fp32, [3, 1]> var_1318 = reshape(shape = var_1317, x = sqrt_s_t)[name = tensor<string, []>("op_1318")];
+            tensor<fp32, [3, 3]> M = real_div(x = causal_with_valid_1, y = var_1318)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 3, 3]> var_1320 = mul(x = qk, y = M)[name = tensor<string, []>("op_1320")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1302)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 3, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1320, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1322_transpose_x_0 = const()[name = tensor<string, []>("op_1322_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1322_transpose_y_0 = const()[name = tensor<string, []>("op_1322_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 3, 64]> var_1322 = matmul(transpose_x = var_1322_transpose_x_0, transpose_y = var_1322_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1322")];
+            tensor<fp32, [3]> var_1323 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1323")];
+            tensor<int32, [4]> var_1324 = const()[name = tensor<string, []>("op_1324"), val = tensor<int32, [4]>([1, 1, 3, 1])];
+            tensor<fp32, [1, 1, 3, 1]> var_1325 = reshape(shape = var_1324, x = var_1323)[name = tensor<string, []>("op_1325")];
+            tensor<fp32, [12, 4, 3, 64]> cross = mul(x = var_1322, y = var_1325)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 3, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 3, 64]> v_masked = mul(x = v_17, y = var_1148)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1331 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1331")];
+            tensor<bool, []> var_1333_transpose_x_1 = const()[name = tensor<string, []>("op_1333_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1333_transpose_y_1 = const()[name = tensor<string, []>("op_1333_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1333 = matmul(transpose_x = var_1333_transpose_x_1, transpose_y = var_1333_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1333")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1331, y = var_1333)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1156)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_990, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_49, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1319_perm_0 = const()[name = tensor<string, []>("op_1319_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1342_perm_0 = const()[name = tensor<string, []>("op_1342_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 4, 64]> var_1319 = transpose(perm = var_1319_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_987, x = var_1319)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> out = reshape(shape = var_1323, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 3, 256]> var_1325 = silu(x = input_187)[name = tensor<string, []>("op_1325")];
-            tensor<fp32, [12, 3, 256]> input_189 = mul(x = var_1325, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 3, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 3, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 3, 4, 64]> var_1342 = transpose(perm = var_1342_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 3, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_64, x = var_1342)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1346 = const()[name = tensor<string, []>("op_1346"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> out = reshape(shape = var_1346, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 3, 256]> var_1348 = silu(x = input_189)[name = tensor<string, []>("op_1348")];
+            tensor<fp32, [12, 3, 256]> input_191 = mul(x = var_1348, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 3, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 3, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_985, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<int32, [4]>([1, 12, 3, 256])];
-            tensor<fp32, [1, 12, 3, 256]> var_1336 = reshape(shape = var_1335, x = xt_5)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337_perm_0 = const()[name = tensor<string, []>("op_1337_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1340 = const()[name = tensor<string, []>("op_1340"), val = tensor<int32, [3]>([3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> var_1337 = transpose(perm = var_1337_perm_0, x = var_1336)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [3, 12, 256]> query_5 = reshape(shape = var_1340, x = var_1337)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 3, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_56, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1358 = const()[name = tensor<string, []>("op_1358"), val = tensor<int32, [4]>([1, 12, 3, 256])];
+            tensor<fp32, [1, 12, 3, 256]> var_1359 = reshape(shape = var_1358, x = xt_5)[name = tensor<string, []>("op_1359")];
+            tensor<int32, [4]> var_1360_perm_0 = const()[name = tensor<string, []>("op_1360_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> var_1360 = transpose(perm = var_1360_perm_0, x = var_1359)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [3, 12, 256]> query_5 = reshape(shape = var_1363, x = var_1360)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 3, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 3, 768]> var_1363 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 3, 768]> var_1386 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 3, 3, 256])];
-            tensor<fp32, [12, 3, 3, 256]> var_1365 = reshape(shape = concat_2, x = var_1363)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [1]> var_1366_axes_0 = const()[name = tensor<string, []>("op_1366_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 3, 3, 256]> var_1366 = expand_dims(axes = var_1366_axes_0, x = var_1365)[name = tensor<string, []>("op_1366")];
-            tensor<int32, [5]> var_1367_perm_0 = const()[name = tensor<string, []>("op_1367_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1368_axes_0 = const()[name = tensor<string, []>("op_1368_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 3, 1, 256]> var_1367 = transpose(perm = var_1367_perm_0, x = var_1366)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 3, 256]> var_1368 = squeeze(axes = var_1368_axes_0, x = var_1367)[name = tensor<string, []>("op_1368")];
+            tensor<fp32, [12, 3, 3, 256]> var_1388 = reshape(shape = concat_2, x = var_1386)[name = tensor<string, []>("op_1388")];
+            tensor<int32, [1]> var_1389_axes_0 = const()[name = tensor<string, []>("op_1389_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 3, 3, 256]> var_1389 = expand_dims(axes = var_1389_axes_0, x = var_1388)[name = tensor<string, []>("op_1389")];
+            tensor<int32, [5]> var_1390_perm_0 = const()[name = tensor<string, []>("op_1390_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1391_axes_0 = const()[name = tensor<string, []>("op_1391_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 3, 1, 256]> var_1390 = transpose(perm = var_1390_perm_0, x = var_1389)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 3, 256]> var_1391 = squeeze(axes = var_1391_axes_0, x = var_1390)[name = tensor<string, []>("op_1391")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 3, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 3, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 3, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 3, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 3, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1368)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1376 = const()[name = tensor<string, []>("op_1376"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1377 = reshape(shape = var_1376, x = q_19)[name = tensor<string, []>("op_1377")];
+            tensor<fp32, [12, 3, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1391)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1399 = const()[name = tensor<string, []>("op_1399"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1400 = reshape(shape = var_1399, x = q_19)[name = tensor<string, []>("op_1400")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1383 = const()[name = tensor<string, []>("op_1383"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1384 = reshape(shape = var_1383, x = k_19)[name = tensor<string, []>("op_1384")];
+            tensor<int32, [3]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1407 = reshape(shape = var_1406, x = k_19)[name = tensor<string, []>("op_1407")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1390 = const()[name = tensor<string, []>("op_1390"), val = tensor<int32, [3]>([12, 12, 64])];
-            tensor<fp32, [12, 12, 64]> var_1391 = reshape(shape = var_1390, x = v_19)[name = tensor<string, []>("op_1391")];
+            tensor<int32, [3]> var_1413 = const()[name = tensor<string, []>("op_1413"), val = tensor<int32, [3]>([12, 12, 64])];
+            tensor<fp32, [12, 12, 64]> var_1414 = reshape(shape = var_1413, x = v_19)[name = tensor<string, []>("op_1414")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1394 = const()[name = tensor<string, []>("op_1394"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1377)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [3, 4, 12, 64]> q = reshape(shape = var_1394, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1396 = const()[name = tensor<string, []>("op_1396"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1384)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [3, 4, 12, 64]> k = reshape(shape = var_1396, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1398 = const()[name = tensor<string, []>("op_1398"), val = tensor<int32, [4]>([3, 4, 12, 64])];
-            tensor<fp32, [12, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1391)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [3, 4, 12, 64]> v = reshape(shape = var_1398, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1417 = const()[name = tensor<string, []>("op_1417"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1400)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [3, 4, 12, 64]> q = reshape(shape = var_1417, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1419 = const()[name = tensor<string, []>("op_1419"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1407)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [3, 4, 12, 64]> k = reshape(shape = var_1419, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1421 = const()[name = tensor<string, []>("op_1421"), val = tensor<int32, [4]>([3, 4, 12, 64])];
+            tensor<fp32, [12, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1414)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [3, 4, 12, 64]> v = reshape(shape = var_1421, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [3, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1237,36 +1248,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [3, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1401 = const()[name = tensor<string, []>("op_1401"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([36, 256])];
-            tensor<fp32, [12, 3, 4, 64]> var_1402 = transpose(perm = var_1401, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1406, x = var_1402)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1410 = const()[name = tensor<string, []>("op_1410"), val = tensor<int32, [3]>([12, 3, 256])];
-            tensor<fp32, [12, 3, 256]> attn_output = reshape(shape = var_1410, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1424 = const()[name = tensor<string, []>("op_1424"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1429 = const()[name = tensor<string, []>("op_1429"), val = tensor<int32, [2]>([36, 256])];
+            tensor<fp32, [12, 3, 4, 64]> var_1425 = transpose(perm = var_1424, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [36, 256]> attn_output_11 = reshape(shape = var_1429, x = var_1425)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [36, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [3]>([12, 3, 256])];
+            tensor<fp32, [12, 3, 256]> attn_output = reshape(shape = var_1433, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [3, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [3, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_985, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [3, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [3, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [3, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [3, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [3, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_56, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [3, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [3, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [3, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [3, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [3, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_985, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([1, 3, 12, 256])];
-            tensor<fp32, [1, 3, 12, 256]> input = reshape(shape = var_1430, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1432 = const()[name = tensor<string, []>("op_1432"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 3, 12, 1]> var_1433 = reduce_l2_norm(axes = var_1432, keep_dims = var_988, x = input)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [3, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_56, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [4]>([1, 3, 12, 256])];
+            tensor<fp32, [1, 3, 12, 256]> input = reshape(shape = var_1453, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 3, 12, 1]> var_1456 = reduce_l2_norm(axes = var_1455, keep_dims = var_55, x = input)[name = tensor<string, []>("op_1456")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 3, 12, 1]> clip_5 = clip(alpha = var_980, beta = const_42, x = var_1433)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 3, 12, 256]> var_1435 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1435")];
+            tensor<fp32, [1, 3, 12, 1]> clip_5 = clip(alpha = var_69, beta = const_42, x = var_1456)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 3, 12, 256]> var_1458 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1458")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([3, 1, 256])];
             tensor<fp32, [3, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([3, 256, 12])];
-            tensor<fp32, [1, 3, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1435)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 3, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1458)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [3, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1277,10 +1288,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 3, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 3, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 3, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1439")];
-            tensor<int32, []> var_1441_axis_0 = const()[name = tensor<string, []>("op_1441_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1441_axis_0, values = (var_1137, nkv))[name = tensor<string, []>("op_1441")];
-            tensor<int32, []> var_1443_axis_0 = const()[name = tensor<string, []>("op_1443_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1443_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1443")];
+            tensor<fp32, [1, 3, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1462")];
+            tensor<int32, []> var_1464_axis_0 = const()[name = tensor<string, []>("op_1464_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1464_axis_0, values = (var_1160, nkv))[name = tensor<string, []>("op_1464")];
+            tensor<int32, []> var_1466_axis_0 = const()[name = tensor<string, []>("op_1466_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1466_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1466")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index d17264963ac53e18b1094bfefb69b97ef1395517..d68918dc375d2c7c3af14336bad337a916fb7fd2 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9986be1fe8f5022bafee300a2d8b49effe1540945ddb6ae9eaab94821982c313
-size 185469
+oid sha256:50cd667bc954b3b7ebc4888f43fd165ccee40cb8ed18429ed49740e418099933
+size 191014
diff --git a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Manifest.json b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Manifest.json
index d174cfd424c10645eda95ea853dbe518047303b0..c3ffeb208da36539e3188bc5e845c45adf972d18 100644
--- a/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Manifest.json
+++ b/optimized/dih3/300ms/ls_eend_dih3_300ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "5F03C3B0-3340-4DD2-B75B-F78D5596775F": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Weights",
-            "name": "weights",
-            "path": "com.apple.CoreML/weights"
-        },
-        "F681008A-C1D4-4681-92CB-73FB966EF8F3": {
+        "0454F758-53A0-42B0-9756-169BE37A9D36": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Specification",
             "name": "model.mlmodel",
             "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "AE39AE0A-40A8-4BF2-AD07-BF3EC2B3068E": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "F681008A-C1D4-4681-92CB-73FB966EF8F3"
+    "rootModelIdentifier": "0454F758-53A0-42B0-9756-169BE37A9D36"
 }
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/analytics/coremldata.bin b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/analytics/coremldata.bin
index 71fa07880e9abdfe454605d65a2e99de84ec0b8c..9bd6bbbc32040f0faedbd1ff3b74fb49b5c05998 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:88851cb773f9d1d6b36189e9c262dcc461c0d873a69a1f1dcf4087f48c4dc60b
+oid sha256:3addfcd6bc7150ebc79d1c7db27c7efa955489919122c583720e067b219299cc
 size 243
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/coremldata.bin b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/coremldata.bin
index 18f01273657c24b054f38d6280b09122f7253d2a..a5260a611a9affcac436c67af4d6a8db2d0621fa 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/coremldata.bin
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:606fe839f02edeb537028ed20f1abdc784cf5a18cef58ae9ff1a2783540da01b
-size 1310
+oid sha256:f4df96ad5df011033bd68dce218b1d654cbb0cb49cbb3ee9dea54e7fa31130ac
+size 1413
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/metadata.json b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/metadata.json
index cba5d30a2ad995503a2fcbad94a71f5afd760118..5dc93bf1d80f7012ba070e1a4aefee4d175ae4f1 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/metadata.json
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=4, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=4, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 64,
+      "Ios17.sliceByIndex" : 68,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 22,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 4 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 45 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 4, 345]",
+        "shape" : "[1, 45, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 4, \"step_duration_ms\": 400, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 45}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/model.mil b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/model.mil
index 7135f6328b831d1038e863b851b7563d75bba5c3..546c200804eb017fad873f0d353fbf1c38982ea7 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/model.mil
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlmodelc/model.mil
@@ -1,234 +1,256 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 4, 345]> features, tensor<fp32, [4]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [4, 4]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [4]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
-            tensor<fp32, [4, 4]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_20 = const()[name = tensor<string, []>("op_20"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_26 = const()[name = tensor<string, []>("op_26"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_28 = const()[name = tensor<string, []>("op_28"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_28, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 45, 23]> features, tensor<fp32, [4]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [4, 4]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [4]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [4]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982080)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983168)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336512)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337600)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338688)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339776)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340864)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345024)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393664)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394752)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443392)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444480)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445568)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446656)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708864)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7709952)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972160)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973248)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235456)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236544)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498752)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499840)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762048)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763136)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764224)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766336)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290688)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307136)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308224)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309312)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310400)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311488)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312576)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574784)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575872)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9576960)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581120)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629760)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630848)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679488)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680576)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682752)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683840)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688000)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736640)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737728)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786368)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787456)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788544)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789632)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051840)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052928)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315136)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316224)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578432)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579520)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841728)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842816)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105024)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106112)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107200)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109312)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633664)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654464)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655552)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917760)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918848)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15919936)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924096)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972736)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973824)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022464)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023552)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025728)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026816)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18030976)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079616)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080704)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129344)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130432)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131520)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132608)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394816)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395904)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658112)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659200)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921408)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922496)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184704)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185792)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448000)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449088)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450176)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452288)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976640)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997440)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998528)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260736)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261824)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262912)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267072)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315712)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316800)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365440)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366528)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368704)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369792)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24373952)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422592)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423680)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472320)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473408)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474496)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475584)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737792)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738880)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001088)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002176)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264384)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265472)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527680)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528768)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27790976)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792064)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793152)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795264)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319616)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340416)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341504)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603712)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604800)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605888)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610048)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658688)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659776)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708416)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709504)))];
+            tensor<fp32, [4, 4]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [4, 4]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710592)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710720)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711808)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236160)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237248)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499456)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500544)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762752)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763840)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026048)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027136)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289344)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290432)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552640)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553728)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554816)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32555904)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818112)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821248)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607744)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608832)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33609920)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618176)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715392)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716480)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813696)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814784)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37815872)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816960)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079168)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080256)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342464)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343552)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605760)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606848)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869056)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870144)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132352)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133440)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134528)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135616)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397824)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39400960)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187456)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188544)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189632)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40197888)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295104)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296192)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393408)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394496)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_56 = const()[name = tensor<string, []>("op_56"), val = tensor<int32, [3]>([1, 4, 345])];
+            tensor<fp32, [1, 4, 345]> input_1 = reshape(shape = var_56, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_65 = const()[name = tensor<string, []>("op_65"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_71 = const()[name = tensor<string, []>("op_71"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 4, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 4, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 4, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_147 = const()[name = tensor<string, []>("op_147"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_148 = mul(x = input_11, y = var_147)[name = tensor<string, []>("op_148")];
-            tensor<fp32, [1, 4, 256]> input_13 = add(x = var_148, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 4, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_66, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 4, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 4, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 4, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_204 = mul(x = input_13, y = var_203)[name = tensor<string, []>("op_204")];
+            tensor<fp32, [1, 4, 256]> input_15 = add(x = var_204, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_28, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 4, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,173 +261,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 4, 256]> var_162 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_163 = const()[name = tensor<string, []>("op_163"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_164 = reshape(shape = var_163, x = var_162)[name = tensor<string, []>("op_164")];
+            tensor<fp32, [1, 4, 256]> var_218 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_219 = const()[name = tensor<string, []>("op_219"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_220 = reshape(shape = var_219, x = var_218)[name = tensor<string, []>("op_220")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_168 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_169 = const()[name = tensor<string, []>("op_169"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_170 = mul(x = var_168, y = var_169)[name = tensor<string, []>("op_170")];
-            tensor<int32, [4]> var_171 = const()[name = tensor<string, []>("op_171"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_172 = reshape(shape = var_171, x = var_170)[name = tensor<string, []>("op_172")];
+            tensor<fp32, [1, 4, 256]> var_224 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_225 = const()[name = tensor<string, []>("op_225"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_226 = mul(x = var_224, y = var_225)[name = tensor<string, []>("op_226")];
+            tensor<int32, [4]> var_227 = const()[name = tensor<string, []>("op_227"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_228 = reshape(shape = var_227, x = var_226)[name = tensor<string, []>("op_228")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_176 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_177 = const()[name = tensor<string, []>("op_177"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_178 = reshape(shape = var_177, x = var_176)[name = tensor<string, []>("op_178")];
+            tensor<fp32, [1, 4, 256]> var_232 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_233 = const()[name = tensor<string, []>("op_233"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_234 = reshape(shape = var_233, x = var_232)[name = tensor<string, []>("op_234")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 4, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [4]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [4]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_172)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_164)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 4, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_228)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 4, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_220)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 4, 4]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_188 = const()[name = tensor<string, []>("op_188"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_189 = reshape(shape = var_188, x = sqrt_s_t_1)[name = tensor<string, []>("op_189")];
-            tensor<fp32, [4, 4]> M_1 = real_div(x = encoder__causal_mask, y = var_189)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 4, 4]> var_191 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_191")];
+            tensor<int32, [2]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_245 = reshape(shape = var_244, x = sqrt_s_t_1)[name = tensor<string, []>("op_245")];
+            tensor<fp32, [4, 4]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_245)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 4, 4]> var_247 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_247")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_178)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_191, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_193_transpose_x_0 = const()[name = tensor<string, []>("op_193_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_193_transpose_y_0 = const()[name = tensor<string, []>("op_193_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_193 = matmul(transpose_x = var_193_transpose_x_0, transpose_y = var_193_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_193")];
-            tensor<fp32, [4]> var_194 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_194")];
-            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_196 = reshape(shape = var_195, x = var_194)[name = tensor<string, []>("op_196")];
-            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_193, y = var_196)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 4, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_234)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 4, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_247, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_249_transpose_x_0 = const()[name = tensor<string, []>("op_249_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_249_transpose_y_0 = const()[name = tensor<string, []>("op_249_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_249 = matmul(transpose_x = var_249_transpose_x_0, transpose_y = var_249_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [4]> var_250 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_250")];
+            tensor<int32, [4]> var_251 = const()[name = tensor<string, []>("op_251"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_252 = reshape(shape = var_251, x = var_250)[name = tensor<string, []>("op_252")];
+            tensor<fp32, [1, 4, 4, 64]> cross_1 = mul(x = var_249, y = var_252)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 4, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_199 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_199")];
-            tensor<bool, []> var_201_transpose_x_1 = const()[name = tensor<string, []>("op_201_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_201_transpose_y_1 = const()[name = tensor<string, []>("op_201_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_201 = matmul(transpose_x = var_201_transpose_x_1, transpose_y = var_201_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_201")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_199, y = var_201)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_203 = const()[name = tensor<string, []>("op_203"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_203)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_205 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_205")];
-            tensor<fp32, [1, 4, 64, 64]> var_206 = real_div(x = new_kv_unnorm_1, y = var_205)[name = tensor<string, []>("op_206")];
-            tensor<int32, [4]> var_207_perm_0 = const()[name = tensor<string, []>("op_207_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_255 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_255")];
+            tensor<bool, []> var_257_transpose_x_1 = const()[name = tensor<string, []>("op_257_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_257_transpose_y_1 = const()[name = tensor<string, []>("op_257_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_257 = matmul(transpose_x = var_257_transpose_x_1, transpose_y = var_257_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_255, y = var_257)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_259 = const()[name = tensor<string, []>("op_259"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_259)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_261 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_261")];
+            tensor<fp32, [1, 4, 64, 64]> var_262 = real_div(x = new_kv_unnorm_1, y = var_261)[name = tensor<string, []>("op_262")];
+            tensor<int32, [4]> var_263_perm_0 = const()[name = tensor<string, []>("op_263_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_207 = transpose(perm = var_207_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_17, x = var_207)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_211 = const()[name = tensor<string, []>("op_211"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_211, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 4, 256]> var_213 = silu(x = input_17)[name = tensor<string, []>("op_213")];
-            tensor<fp32, [1, 4, 256]> input_19 = mul(x = var_213, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 4, 4, 64]> var_263 = transpose(perm = var_263_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 4, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_74, x = var_263)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_267 = const()[name = tensor<string, []>("op_267"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_5 = reshape(shape = var_267, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 4, 256]> var_269 = silu(x = input_19)[name = tensor<string, []>("op_269")];
+            tensor<fp32, [1, 4, 256]> input_21 = mul(x = var_269, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 4, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 4, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_221_begin_0 = const()[name = tensor<string, []>("op_221_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_221_end_0 = const()[name = tensor<string, []>("op_221_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_221_end_mask_0 = const()[name = tensor<string, []>("op_221_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_221 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = x_3)[name = tensor<string, []>("op_221")];
-            tensor<int32, [3]> var_224_begin_0 = const()[name = tensor<string, []>("op_224_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_224_end_0 = const()[name = tensor<string, []>("op_224_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_224_end_mask_0 = const()[name = tensor<string, []>("op_224_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_224 = slice_by_index(begin = var_224_begin_0, end = var_224_end_0, end_mask = var_224_end_mask_0, x = window_1)[name = tensor<string, []>("op_224")];
+            tensor<int32, [3]> var_277_begin_0 = const()[name = tensor<string, []>("op_277_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_277_end_0 = const()[name = tensor<string, []>("op_277_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_277_end_mask_0 = const()[name = tensor<string, []>("op_277_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_277 = slice_by_index(begin = var_277_begin_0, end = var_277_end_0, end_mask = var_277_end_mask_0, x = x_3)[name = tensor<string, []>("op_277")];
+            tensor<int32, [3]> var_280_begin_0 = const()[name = tensor<string, []>("op_280_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_280_end_0 = const()[name = tensor<string, []>("op_280_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_280_end_mask_0 = const()[name = tensor<string, []>("op_280_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_280 = slice_by_index(begin = var_280_begin_0, end = var_280_end_0, end_mask = var_280_end_mask_0, x = window_1)[name = tensor<string, []>("op_280")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_26, interleave = window_3_interleave_0, values = (var_224, var_221))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_229_begin_0 = const()[name = tensor<string, []>("op_229_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_229_end_0 = const()[name = tensor<string, []>("op_229_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_229_end_mask_0 = const()[name = tensor<string, []>("op_229_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_229 = slice_by_index(begin = var_229_begin_0, end = var_229_end_0, end_mask = var_229_end_mask_0, x = x_3)[name = tensor<string, []>("op_229")];
-            tensor<int32, [3]> var_232_begin_0 = const()[name = tensor<string, []>("op_232_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_232_end_0 = const()[name = tensor<string, []>("op_232_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_232_end_mask_0 = const()[name = tensor<string, []>("op_232_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_232 = slice_by_index(begin = var_232_begin_0, end = var_232_end_0, end_mask = var_232_end_mask_0, x = window_3)[name = tensor<string, []>("op_232")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_82, interleave = window_3_interleave_0, values = (var_280, var_277))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_285_begin_0 = const()[name = tensor<string, []>("op_285_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_285_end_0 = const()[name = tensor<string, []>("op_285_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_285_end_mask_0 = const()[name = tensor<string, []>("op_285_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_285 = slice_by_index(begin = var_285_begin_0, end = var_285_end_0, end_mask = var_285_end_mask_0, x = x_3)[name = tensor<string, []>("op_285")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = window_3)[name = tensor<string, []>("op_288")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_26, interleave = window_5_interleave_0, values = (var_232, var_229))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_237_begin_0 = const()[name = tensor<string, []>("op_237_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_237_end_0 = const()[name = tensor<string, []>("op_237_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_237_end_mask_0 = const()[name = tensor<string, []>("op_237_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_237 = slice_by_index(begin = var_237_begin_0, end = var_237_end_0, end_mask = var_237_end_mask_0, x = x_3)[name = tensor<string, []>("op_237")];
-            tensor<int32, [3]> var_240_begin_0 = const()[name = tensor<string, []>("op_240_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_240_end_0 = const()[name = tensor<string, []>("op_240_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_240_end_mask_0 = const()[name = tensor<string, []>("op_240_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_240 = slice_by_index(begin = var_240_begin_0, end = var_240_end_0, end_mask = var_240_end_mask_0, x = window_5)[name = tensor<string, []>("op_240")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_82, interleave = window_5_interleave_0, values = (var_288, var_285))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_293_begin_0 = const()[name = tensor<string, []>("op_293_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_293_end_0 = const()[name = tensor<string, []>("op_293_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_293_end_mask_0 = const()[name = tensor<string, []>("op_293_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_293 = slice_by_index(begin = var_293_begin_0, end = var_293_end_0, end_mask = var_293_end_mask_0, x = x_3)[name = tensor<string, []>("op_293")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = window_5)[name = tensor<string, []>("op_296")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_26, interleave = window_7_interleave_0, values = (var_240, var_237))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_245_begin_0 = const()[name = tensor<string, []>("op_245_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_245_end_0 = const()[name = tensor<string, []>("op_245_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_245_end_mask_0 = const()[name = tensor<string, []>("op_245_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_245 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = x_3)[name = tensor<string, []>("op_245")];
-            tensor<int32, [3]> var_248_begin_0 = const()[name = tensor<string, []>("op_248_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_248_end_0 = const()[name = tensor<string, []>("op_248_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_248_end_mask_0 = const()[name = tensor<string, []>("op_248_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_248 = slice_by_index(begin = var_248_begin_0, end = var_248_end_0, end_mask = var_248_end_mask_0, x = window_7)[name = tensor<string, []>("op_248")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_82, interleave = window_7_interleave_0, values = (var_296, var_293))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_301 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = x_3)[name = tensor<string, []>("op_301")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = window_7)[name = tensor<string, []>("op_304")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_26, interleave = window_9_interleave_0, values = (var_248, var_245))[name = tensor<string, []>("window_9")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_21 = concat(axis = var_23, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_82, interleave = window_9_interleave_0, values = (var_304, var_301))[name = tensor<string, []>("window_9")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_23 = concat(axis = var_69, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_273_split_sizes_0 = const()[name = tensor<string, []>("op_273_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_273_axis_0 = const()[name = tensor<string, []>("op_273_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_273_0, tensor<fp32, [4, 256, 16]> var_273_1 = split(axis = var_273_axis_0, split_sizes = var_273_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_273")];
-            tensor<fp32, [4, 256, 16]> var_275 = sigmoid(x = var_273_1)[name = tensor<string, []>("op_275")];
-            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_273_0, y = var_275)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [4, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [4, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_329_split_sizes_0 = const()[name = tensor<string, []>("op_329_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_329_axis_0 = const()[name = tensor<string, []>("op_329_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_329_0, tensor<fp32, [4, 256, 16]> var_329_1 = split(axis = var_329_axis_0, split_sizes = var_329_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_329")];
+            tensor<fp32, [4, 256, 16]> var_331 = sigmoid(x = var_329_1)[name = tensor<string, []>("op_331")];
+            tensor<fp32, [4, 256, 16]> inputs_5 = mul(x = var_329_0, y = var_331)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [4, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [4, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [4, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [4, 1, 256]> var_306 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_306")];
-            tensor<int32, [3]> var_308_perm_0 = const()[name = tensor<string, []>("op_308_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_308 = transpose(perm = var_308_perm_0, x = var_306)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 4, 256]> input_31 = add(x = x_3, y = var_308)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 4, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 4, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_331 = const()[name = tensor<string, []>("op_331"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_332 = mul(x = input_39, y = var_331)[name = tensor<string, []>("op_332")];
-            tensor<fp32, [1, 4, 256]> input_41 = add(x = var_332, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_28, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_362_begin_0 = const()[name = tensor<string, []>("op_362_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_362_end_0 = const()[name = tensor<string, []>("op_362_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_362_end_mask_0 = const()[name = tensor<string, []>("op_362_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [4, 1, 256]> var_362 = slice_by_index(begin = var_362_begin_0, end = var_362_end_0, end_mask = var_362_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_362")];
+            tensor<int32, [3]> var_364_perm_0 = const()[name = tensor<string, []>("op_364_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_364 = transpose(perm = var_364_perm_0, x = var_362)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 4, 256]> input_33 = add(x = x_3, y = var_364)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 4, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 4, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 4, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_387 = const()[name = tensor<string, []>("op_387"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_388 = mul(x = input_41, y = var_387)[name = tensor<string, []>("op_388")];
+            tensor<fp32, [1, 4, 256]> input_43 = add(x = var_388, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 4, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 4, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_361 = const()[name = tensor<string, []>("op_361"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_362 = mul(x = input_51, y = var_361)[name = tensor<string, []>("op_362")];
-            tensor<fp32, [1, 4, 256]> input_53 = add(x = var_362, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 4, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 4, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 4, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 4, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_418 = mul(x = input_53, y = var_417)[name = tensor<string, []>("op_418")];
+            tensor<fp32, [1, 4, 256]> input_55 = add(x = var_418, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_28, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 4, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -416,173 +438,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 4, 256]> var_376 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_377 = const()[name = tensor<string, []>("op_377"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_378 = reshape(shape = var_377, x = var_376)[name = tensor<string, []>("op_378")];
+            tensor<fp32, [1, 4, 256]> var_432 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_433 = const()[name = tensor<string, []>("op_433"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_434 = reshape(shape = var_433, x = var_432)[name = tensor<string, []>("op_434")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_382 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_383 = const()[name = tensor<string, []>("op_383"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_384 = mul(x = var_382, y = var_383)[name = tensor<string, []>("op_384")];
-            tensor<int32, [4]> var_385 = const()[name = tensor<string, []>("op_385"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_386 = reshape(shape = var_385, x = var_384)[name = tensor<string, []>("op_386")];
+            tensor<fp32, [1, 4, 256]> var_438 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_439 = const()[name = tensor<string, []>("op_439"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_440 = mul(x = var_438, y = var_439)[name = tensor<string, []>("op_440")];
+            tensor<int32, [4]> var_441 = const()[name = tensor<string, []>("op_441"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_442 = reshape(shape = var_441, x = var_440)[name = tensor<string, []>("op_442")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_390 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_391 = const()[name = tensor<string, []>("op_391"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_392 = reshape(shape = var_391, x = var_390)[name = tensor<string, []>("op_392")];
+            tensor<fp32, [1, 4, 256]> var_446 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_447 = const()[name = tensor<string, []>("op_447"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_448 = reshape(shape = var_447, x = var_446)[name = tensor<string, []>("op_448")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 4, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [4]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [4]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_386)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_378)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 4, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_442)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 4, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_434)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 4, 4]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_402 = const()[name = tensor<string, []>("op_402"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_403 = reshape(shape = var_402, x = sqrt_s_t_3)[name = tensor<string, []>("op_403")];
-            tensor<fp32, [4, 4]> M_3 = real_div(x = encoder__causal_mask, y = var_403)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 4, 4]> var_405 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_405")];
+            tensor<int32, [2]> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_459 = reshape(shape = var_458, x = sqrt_s_t_3)[name = tensor<string, []>("op_459")];
+            tensor<fp32, [4, 4]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_459)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 4, 4]> var_461 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_461")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_392)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_405, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_407_transpose_x_0 = const()[name = tensor<string, []>("op_407_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_407_transpose_y_0 = const()[name = tensor<string, []>("op_407_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_407 = matmul(transpose_x = var_407_transpose_x_0, transpose_y = var_407_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_407")];
-            tensor<fp32, [4]> var_408 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_408")];
-            tensor<int32, [4]> var_409 = const()[name = tensor<string, []>("op_409"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_410 = reshape(shape = var_409, x = var_408)[name = tensor<string, []>("op_410")];
-            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_407, y = var_410)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 4, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_448)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 4, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_461, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_463_transpose_x_0 = const()[name = tensor<string, []>("op_463_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_463_transpose_y_0 = const()[name = tensor<string, []>("op_463_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_463 = matmul(transpose_x = var_463_transpose_x_0, transpose_y = var_463_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [4]> var_464 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_464")];
+            tensor<int32, [4]> var_465 = const()[name = tensor<string, []>("op_465"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_466 = reshape(shape = var_465, x = var_464)[name = tensor<string, []>("op_466")];
+            tensor<fp32, [1, 4, 4, 64]> cross_3 = mul(x = var_463, y = var_466)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 4, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_413 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_413")];
-            tensor<bool, []> var_415_transpose_x_1 = const()[name = tensor<string, []>("op_415_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_415_transpose_y_1 = const()[name = tensor<string, []>("op_415_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_415 = matmul(transpose_x = var_415_transpose_x_1, transpose_y = var_415_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_415")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_413, y = var_415)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_417 = const()[name = tensor<string, []>("op_417"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_417)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_419 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 64, 64]> var_420 = real_div(x = new_kv_unnorm_3, y = var_419)[name = tensor<string, []>("op_420")];
-            tensor<int32, [4]> var_421_perm_0 = const()[name = tensor<string, []>("op_421_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_469 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_469")];
+            tensor<bool, []> var_471_transpose_x_1 = const()[name = tensor<string, []>("op_471_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_471_transpose_y_1 = const()[name = tensor<string, []>("op_471_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_471 = matmul(transpose_x = var_471_transpose_x_1, transpose_y = var_471_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_469, y = var_471)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_473 = const()[name = tensor<string, []>("op_473"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_473)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_475 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_475")];
+            tensor<fp32, [1, 4, 64, 64]> var_476 = real_div(x = new_kv_unnorm_3, y = var_475)[name = tensor<string, []>("op_476")];
+            tensor<int32, [4]> var_477_perm_0 = const()[name = tensor<string, []>("op_477_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_421 = transpose(perm = var_421_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_17, x = var_421)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_425 = const()[name = tensor<string, []>("op_425"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_425, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 4, 256]> var_427 = silu(x = input_57)[name = tensor<string, []>("op_427")];
-            tensor<fp32, [1, 4, 256]> input_59 = mul(x = var_427, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 4, 4, 64]> var_477 = transpose(perm = var_477_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 4, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_74, x = var_477)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_481 = const()[name = tensor<string, []>("op_481"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_11 = reshape(shape = var_481, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 4, 256]> var_483 = silu(x = input_59)[name = tensor<string, []>("op_483")];
+            tensor<fp32, [1, 4, 256]> input_61 = mul(x = var_483, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 4, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 4, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_11_begin_0 = const()[name = tensor<string, []>("window_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_11_end_0 = const()[name = tensor<string, []>("window_11_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_11_end_mask_0 = const()[name = tensor<string, []>("window_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_11_squeeze_mask_0 = const()[name = tensor<string, []>("window_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_11 = slice_by_index(begin = window_11_begin_0, end = window_11_end_0, end_mask = window_11_end_mask_0, squeeze_mask = window_11_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_11")];
-            tensor<int32, [3]> var_435_begin_0 = const()[name = tensor<string, []>("op_435_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_435_end_0 = const()[name = tensor<string, []>("op_435_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_435_end_mask_0 = const()[name = tensor<string, []>("op_435_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_435 = slice_by_index(begin = var_435_begin_0, end = var_435_end_0, end_mask = var_435_end_mask_0, x = x_9)[name = tensor<string, []>("op_435")];
-            tensor<int32, [3]> var_438_begin_0 = const()[name = tensor<string, []>("op_438_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_438_end_0 = const()[name = tensor<string, []>("op_438_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_438_end_mask_0 = const()[name = tensor<string, []>("op_438_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_438 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = window_11)[name = tensor<string, []>("op_438")];
+            tensor<int32, [3]> var_491_begin_0 = const()[name = tensor<string, []>("op_491_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_491_end_0 = const()[name = tensor<string, []>("op_491_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_491_end_mask_0 = const()[name = tensor<string, []>("op_491_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_491 = slice_by_index(begin = var_491_begin_0, end = var_491_end_0, end_mask = var_491_end_mask_0, x = x_9)[name = tensor<string, []>("op_491")];
+            tensor<int32, [3]> var_494_begin_0 = const()[name = tensor<string, []>("op_494_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_494_end_0 = const()[name = tensor<string, []>("op_494_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_494_end_mask_0 = const()[name = tensor<string, []>("op_494_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_494 = slice_by_index(begin = var_494_begin_0, end = var_494_end_0, end_mask = var_494_end_mask_0, x = window_11)[name = tensor<string, []>("op_494")];
             tensor<bool, []> window_13_interleave_0 = const()[name = tensor<string, []>("window_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_26, interleave = window_13_interleave_0, values = (var_438, var_435))[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_443 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = x_9)[name = tensor<string, []>("op_443")];
-            tensor<int32, [3]> var_446_begin_0 = const()[name = tensor<string, []>("op_446_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_446_end_0 = const()[name = tensor<string, []>("op_446_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_446_end_mask_0 = const()[name = tensor<string, []>("op_446_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_446 = slice_by_index(begin = var_446_begin_0, end = var_446_end_0, end_mask = var_446_end_mask_0, x = window_13)[name = tensor<string, []>("op_446")];
+            tensor<fp32, [1, 16, 256]> window_13 = concat(axis = var_82, interleave = window_13_interleave_0, values = (var_494, var_491))[name = tensor<string, []>("window_13")];
+            tensor<int32, [3]> var_499_begin_0 = const()[name = tensor<string, []>("op_499_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_499_end_0 = const()[name = tensor<string, []>("op_499_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_499_end_mask_0 = const()[name = tensor<string, []>("op_499_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_499 = slice_by_index(begin = var_499_begin_0, end = var_499_end_0, end_mask = var_499_end_mask_0, x = x_9)[name = tensor<string, []>("op_499")];
+            tensor<int32, [3]> var_502_begin_0 = const()[name = tensor<string, []>("op_502_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_502_end_0 = const()[name = tensor<string, []>("op_502_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_502_end_mask_0 = const()[name = tensor<string, []>("op_502_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_502 = slice_by_index(begin = var_502_begin_0, end = var_502_end_0, end_mask = var_502_end_mask_0, x = window_13)[name = tensor<string, []>("op_502")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_26, interleave = window_15_interleave_0, values = (var_446, var_443))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_451_begin_0 = const()[name = tensor<string, []>("op_451_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_451_end_0 = const()[name = tensor<string, []>("op_451_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_451_end_mask_0 = const()[name = tensor<string, []>("op_451_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_451 = slice_by_index(begin = var_451_begin_0, end = var_451_end_0, end_mask = var_451_end_mask_0, x = x_9)[name = tensor<string, []>("op_451")];
-            tensor<int32, [3]> var_454_begin_0 = const()[name = tensor<string, []>("op_454_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_454_end_0 = const()[name = tensor<string, []>("op_454_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_454_end_mask_0 = const()[name = tensor<string, []>("op_454_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_454 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = window_15)[name = tensor<string, []>("op_454")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_82, interleave = window_15_interleave_0, values = (var_502, var_499))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_507_begin_0 = const()[name = tensor<string, []>("op_507_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_507_end_0 = const()[name = tensor<string, []>("op_507_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_507_end_mask_0 = const()[name = tensor<string, []>("op_507_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_507 = slice_by_index(begin = var_507_begin_0, end = var_507_end_0, end_mask = var_507_end_mask_0, x = x_9)[name = tensor<string, []>("op_507")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = window_15)[name = tensor<string, []>("op_510")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_26, interleave = window_17_interleave_0, values = (var_454, var_451))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_459_begin_0 = const()[name = tensor<string, []>("op_459_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_459_end_0 = const()[name = tensor<string, []>("op_459_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_459_end_mask_0 = const()[name = tensor<string, []>("op_459_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_459 = slice_by_index(begin = var_459_begin_0, end = var_459_end_0, end_mask = var_459_end_mask_0, x = x_9)[name = tensor<string, []>("op_459")];
-            tensor<int32, [3]> var_462_begin_0 = const()[name = tensor<string, []>("op_462_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_462_end_0 = const()[name = tensor<string, []>("op_462_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_462_end_mask_0 = const()[name = tensor<string, []>("op_462_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_462 = slice_by_index(begin = var_462_begin_0, end = var_462_end_0, end_mask = var_462_end_mask_0, x = window_17)[name = tensor<string, []>("op_462")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_82, interleave = window_17_interleave_0, values = (var_510, var_507))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_515_begin_0 = const()[name = tensor<string, []>("op_515_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_515_end_0 = const()[name = tensor<string, []>("op_515_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_515_end_mask_0 = const()[name = tensor<string, []>("op_515_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_515 = slice_by_index(begin = var_515_begin_0, end = var_515_end_0, end_mask = var_515_end_mask_0, x = x_9)[name = tensor<string, []>("op_515")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = window_17)[name = tensor<string, []>("op_518")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_26, interleave = window_19_interleave_0, values = (var_462, var_459))[name = tensor<string, []>("window_19")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_61 = concat(axis = var_23, interleave = input_61_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_82, interleave = window_19_interleave_0, values = (var_518, var_515))[name = tensor<string, []>("window_19")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_63 = concat(axis = var_69, interleave = input_63_interleave_0, values = (window_13, window_15, window_17, window_19))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_487_split_sizes_0 = const()[name = tensor<string, []>("op_487_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_487_axis_0 = const()[name = tensor<string, []>("op_487_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_487_0, tensor<fp32, [4, 256, 16]> var_487_1 = split(axis = var_487_axis_0, split_sizes = var_487_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_487")];
-            tensor<fp32, [4, 256, 16]> var_489 = sigmoid(x = var_487_1)[name = tensor<string, []>("op_489")];
-            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_487_0, y = var_489)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [4, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [4, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_543_split_sizes_0 = const()[name = tensor<string, []>("op_543_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_543_axis_0 = const()[name = tensor<string, []>("op_543_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_543_0, tensor<fp32, [4, 256, 16]> var_543_1 = split(axis = var_543_axis_0, split_sizes = var_543_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_543")];
+            tensor<fp32, [4, 256, 16]> var_545 = sigmoid(x = var_543_1)[name = tensor<string, []>("op_545")];
+            tensor<fp32, [4, 256, 16]> inputs_15 = mul(x = var_543_0, y = var_545)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [4, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [4, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [4, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_520_begin_0 = const()[name = tensor<string, []>("op_520_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_520_end_0 = const()[name = tensor<string, []>("op_520_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_520_end_mask_0 = const()[name = tensor<string, []>("op_520_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [4, 1, 256]> var_520 = slice_by_index(begin = var_520_begin_0, end = var_520_end_0, end_mask = var_520_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_520")];
-            tensor<int32, [3]> var_522_perm_0 = const()[name = tensor<string, []>("op_522_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_522 = transpose(perm = var_522_perm_0, x = var_520)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 4, 256]> input_71 = add(x = x_9, y = var_522)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 4, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 4, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_545 = const()[name = tensor<string, []>("op_545"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_546 = mul(x = input_79, y = var_545)[name = tensor<string, []>("op_546")];
-            tensor<fp32, [1, 4, 256]> input_81 = add(x = var_546, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_28, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_576_begin_0 = const()[name = tensor<string, []>("op_576_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_576_end_0 = const()[name = tensor<string, []>("op_576_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_576_end_mask_0 = const()[name = tensor<string, []>("op_576_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [4, 1, 256]> var_576 = slice_by_index(begin = var_576_begin_0, end = var_576_end_0, end_mask = var_576_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_576")];
+            tensor<int32, [3]> var_578_perm_0 = const()[name = tensor<string, []>("op_578_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_578 = transpose(perm = var_578_perm_0, x = var_576)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 4, 256]> input_73 = add(x = x_9, y = var_578)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 4, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 4, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 4, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_601 = const()[name = tensor<string, []>("op_601"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_602 = mul(x = input_81, y = var_601)[name = tensor<string, []>("op_602")];
+            tensor<fp32, [1, 4, 256]> input_83 = add(x = var_602, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 4, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 4, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_575 = const()[name = tensor<string, []>("op_575"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_576 = mul(x = input_91, y = var_575)[name = tensor<string, []>("op_576")];
-            tensor<fp32, [1, 4, 256]> input_93 = add(x = var_576, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 4, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 4, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 4, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 4, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_632 = mul(x = input_93, y = var_631)[name = tensor<string, []>("op_632")];
+            tensor<fp32, [1, 4, 256]> input_95 = add(x = var_632, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_28, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 4, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -593,173 +615,173 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 4, 256]> var_590 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_591 = const()[name = tensor<string, []>("op_591"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_592 = reshape(shape = var_591, x = var_590)[name = tensor<string, []>("op_592")];
+            tensor<fp32, [1, 4, 256]> var_646 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_647 = const()[name = tensor<string, []>("op_647"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_648 = reshape(shape = var_647, x = var_646)[name = tensor<string, []>("op_648")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_596 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_597 = const()[name = tensor<string, []>("op_597"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_598 = mul(x = var_596, y = var_597)[name = tensor<string, []>("op_598")];
-            tensor<int32, [4]> var_599 = const()[name = tensor<string, []>("op_599"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_600 = reshape(shape = var_599, x = var_598)[name = tensor<string, []>("op_600")];
+            tensor<fp32, [1, 4, 256]> var_652 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_653 = const()[name = tensor<string, []>("op_653"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_654 = mul(x = var_652, y = var_653)[name = tensor<string, []>("op_654")];
+            tensor<int32, [4]> var_655 = const()[name = tensor<string, []>("op_655"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_656 = reshape(shape = var_655, x = var_654)[name = tensor<string, []>("op_656")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_604 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_605 = const()[name = tensor<string, []>("op_605"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_606 = reshape(shape = var_605, x = var_604)[name = tensor<string, []>("op_606")];
+            tensor<fp32, [1, 4, 256]> var_660 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_661 = const()[name = tensor<string, []>("op_661"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_662 = reshape(shape = var_661, x = var_660)[name = tensor<string, []>("op_662")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 4, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [4]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [4]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_600)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_592)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 4, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_656)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 4, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_648)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 4, 4]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_617 = reshape(shape = var_616, x = sqrt_s_t_5)[name = tensor<string, []>("op_617")];
-            tensor<fp32, [4, 4]> M_5 = real_div(x = encoder__causal_mask, y = var_617)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 4, 4]> var_619 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_619")];
+            tensor<int32, [2]> var_672 = const()[name = tensor<string, []>("op_672"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_673 = reshape(shape = var_672, x = sqrt_s_t_5)[name = tensor<string, []>("op_673")];
+            tensor<fp32, [4, 4]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_673)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 4, 4]> var_675 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_675")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_606)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_619, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_621_transpose_x_0 = const()[name = tensor<string, []>("op_621_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_621_transpose_y_0 = const()[name = tensor<string, []>("op_621_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_621 = matmul(transpose_x = var_621_transpose_x_0, transpose_y = var_621_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_621")];
-            tensor<fp32, [4]> var_622 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_622")];
-            tensor<int32, [4]> var_623 = const()[name = tensor<string, []>("op_623"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_624 = reshape(shape = var_623, x = var_622)[name = tensor<string, []>("op_624")];
-            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_621, y = var_624)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 4, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_662)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 4, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_675, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_677_transpose_x_0 = const()[name = tensor<string, []>("op_677_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_677_transpose_y_0 = const()[name = tensor<string, []>("op_677_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_677 = matmul(transpose_x = var_677_transpose_x_0, transpose_y = var_677_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [4]> var_678 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_678")];
+            tensor<int32, [4]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_680 = reshape(shape = var_679, x = var_678)[name = tensor<string, []>("op_680")];
+            tensor<fp32, [1, 4, 4, 64]> cross_5 = mul(x = var_677, y = var_680)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 4, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_627 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_627")];
-            tensor<bool, []> var_629_transpose_x_1 = const()[name = tensor<string, []>("op_629_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_629_transpose_y_1 = const()[name = tensor<string, []>("op_629_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_629 = matmul(transpose_x = var_629_transpose_x_1, transpose_y = var_629_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_629")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_627, y = var_629)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_631 = const()[name = tensor<string, []>("op_631"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_631)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_633 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_633")];
-            tensor<fp32, [1, 4, 64, 64]> var_634 = real_div(x = new_kv_unnorm_5, y = var_633)[name = tensor<string, []>("op_634")];
-            tensor<int32, [4]> var_635_perm_0 = const()[name = tensor<string, []>("op_635_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_683 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_683")];
+            tensor<bool, []> var_685_transpose_x_1 = const()[name = tensor<string, []>("op_685_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_685_transpose_y_1 = const()[name = tensor<string, []>("op_685_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_685 = matmul(transpose_x = var_685_transpose_x_1, transpose_y = var_685_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_683, y = var_685)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_687 = const()[name = tensor<string, []>("op_687"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_687)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_689 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_689")];
+            tensor<fp32, [1, 4, 64, 64]> var_690 = real_div(x = new_kv_unnorm_5, y = var_689)[name = tensor<string, []>("op_690")];
+            tensor<int32, [4]> var_691_perm_0 = const()[name = tensor<string, []>("op_691_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_635 = transpose(perm = var_635_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_17, x = var_635)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_639 = const()[name = tensor<string, []>("op_639"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_639, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 4, 256]> var_641 = silu(x = input_97)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 256]> input_99 = mul(x = var_641, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 4, 4, 64]> var_691 = transpose(perm = var_691_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 4, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_74, x = var_691)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_695 = const()[name = tensor<string, []>("op_695"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_17 = reshape(shape = var_695, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 4, 256]> var_697 = silu(x = input_99)[name = tensor<string, []>("op_697")];
+            tensor<fp32, [1, 4, 256]> input_101 = mul(x = var_697, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 4, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 4, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_21_begin_0 = const()[name = tensor<string, []>("window_21_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_21_end_0 = const()[name = tensor<string, []>("window_21_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_21_end_mask_0 = const()[name = tensor<string, []>("window_21_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_21_squeeze_mask_0 = const()[name = tensor<string, []>("window_21_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_21 = slice_by_index(begin = window_21_begin_0, end = window_21_end_0, end_mask = window_21_end_mask_0, squeeze_mask = window_21_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_649_begin_0 = const()[name = tensor<string, []>("op_649_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_649_end_0 = const()[name = tensor<string, []>("op_649_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_649_end_mask_0 = const()[name = tensor<string, []>("op_649_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_649 = slice_by_index(begin = var_649_begin_0, end = var_649_end_0, end_mask = var_649_end_mask_0, x = x_15)[name = tensor<string, []>("op_649")];
-            tensor<int32, [3]> var_652_begin_0 = const()[name = tensor<string, []>("op_652_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_652_end_0 = const()[name = tensor<string, []>("op_652_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_652_end_mask_0 = const()[name = tensor<string, []>("op_652_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_652 = slice_by_index(begin = var_652_begin_0, end = var_652_end_0, end_mask = var_652_end_mask_0, x = window_21)[name = tensor<string, []>("op_652")];
+            tensor<int32, [3]> var_705_begin_0 = const()[name = tensor<string, []>("op_705_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_705_end_0 = const()[name = tensor<string, []>("op_705_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_705_end_mask_0 = const()[name = tensor<string, []>("op_705_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_705 = slice_by_index(begin = var_705_begin_0, end = var_705_end_0, end_mask = var_705_end_mask_0, x = x_15)[name = tensor<string, []>("op_705")];
+            tensor<int32, [3]> var_708_begin_0 = const()[name = tensor<string, []>("op_708_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_708_end_0 = const()[name = tensor<string, []>("op_708_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_708_end_mask_0 = const()[name = tensor<string, []>("op_708_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_708 = slice_by_index(begin = var_708_begin_0, end = var_708_end_0, end_mask = var_708_end_mask_0, x = window_21)[name = tensor<string, []>("op_708")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_26, interleave = window_23_interleave_0, values = (var_652, var_649))[name = tensor<string, []>("window_23")];
-            tensor<int32, [3]> var_657_begin_0 = const()[name = tensor<string, []>("op_657_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_657_end_0 = const()[name = tensor<string, []>("op_657_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_657_end_mask_0 = const()[name = tensor<string, []>("op_657_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_657 = slice_by_index(begin = var_657_begin_0, end = var_657_end_0, end_mask = var_657_end_mask_0, x = x_15)[name = tensor<string, []>("op_657")];
-            tensor<int32, [3]> var_660_begin_0 = const()[name = tensor<string, []>("op_660_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_660_end_0 = const()[name = tensor<string, []>("op_660_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_660_end_mask_0 = const()[name = tensor<string, []>("op_660_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_660 = slice_by_index(begin = var_660_begin_0, end = var_660_end_0, end_mask = var_660_end_mask_0, x = window_23)[name = tensor<string, []>("op_660")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_82, interleave = window_23_interleave_0, values = (var_708, var_705))[name = tensor<string, []>("window_23")];
+            tensor<int32, [3]> var_713_begin_0 = const()[name = tensor<string, []>("op_713_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_713_end_0 = const()[name = tensor<string, []>("op_713_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_713_end_mask_0 = const()[name = tensor<string, []>("op_713_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_713 = slice_by_index(begin = var_713_begin_0, end = var_713_end_0, end_mask = var_713_end_mask_0, x = x_15)[name = tensor<string, []>("op_713")];
+            tensor<int32, [3]> var_716_begin_0 = const()[name = tensor<string, []>("op_716_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_716_end_0 = const()[name = tensor<string, []>("op_716_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_716_end_mask_0 = const()[name = tensor<string, []>("op_716_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_716 = slice_by_index(begin = var_716_begin_0, end = var_716_end_0, end_mask = var_716_end_mask_0, x = window_23)[name = tensor<string, []>("op_716")];
             tensor<bool, []> window_25_interleave_0 = const()[name = tensor<string, []>("window_25_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_26, interleave = window_25_interleave_0, values = (var_660, var_657))[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_665_begin_0 = const()[name = tensor<string, []>("op_665_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_665_end_0 = const()[name = tensor<string, []>("op_665_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_665_end_mask_0 = const()[name = tensor<string, []>("op_665_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_665 = slice_by_index(begin = var_665_begin_0, end = var_665_end_0, end_mask = var_665_end_mask_0, x = x_15)[name = tensor<string, []>("op_665")];
-            tensor<int32, [3]> var_668_begin_0 = const()[name = tensor<string, []>("op_668_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_668_end_0 = const()[name = tensor<string, []>("op_668_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_668_end_mask_0 = const()[name = tensor<string, []>("op_668_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_668 = slice_by_index(begin = var_668_begin_0, end = var_668_end_0, end_mask = var_668_end_mask_0, x = window_25)[name = tensor<string, []>("op_668")];
+            tensor<fp32, [1, 16, 256]> window_25 = concat(axis = var_82, interleave = window_25_interleave_0, values = (var_716, var_713))[name = tensor<string, []>("window_25")];
+            tensor<int32, [3]> var_721_begin_0 = const()[name = tensor<string, []>("op_721_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_721_end_0 = const()[name = tensor<string, []>("op_721_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_721_end_mask_0 = const()[name = tensor<string, []>("op_721_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_721 = slice_by_index(begin = var_721_begin_0, end = var_721_end_0, end_mask = var_721_end_mask_0, x = x_15)[name = tensor<string, []>("op_721")];
+            tensor<int32, [3]> var_724_begin_0 = const()[name = tensor<string, []>("op_724_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_724_end_0 = const()[name = tensor<string, []>("op_724_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_724_end_mask_0 = const()[name = tensor<string, []>("op_724_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_724 = slice_by_index(begin = var_724_begin_0, end = var_724_end_0, end_mask = var_724_end_mask_0, x = window_25)[name = tensor<string, []>("op_724")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_26, interleave = window_27_interleave_0, values = (var_668, var_665))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_673_begin_0 = const()[name = tensor<string, []>("op_673_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_673_end_0 = const()[name = tensor<string, []>("op_673_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_673_end_mask_0 = const()[name = tensor<string, []>("op_673_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_673 = slice_by_index(begin = var_673_begin_0, end = var_673_end_0, end_mask = var_673_end_mask_0, x = x_15)[name = tensor<string, []>("op_673")];
-            tensor<int32, [3]> var_676_begin_0 = const()[name = tensor<string, []>("op_676_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_676_end_0 = const()[name = tensor<string, []>("op_676_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_676_end_mask_0 = const()[name = tensor<string, []>("op_676_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_676 = slice_by_index(begin = var_676_begin_0, end = var_676_end_0, end_mask = var_676_end_mask_0, x = window_27)[name = tensor<string, []>("op_676")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_82, interleave = window_27_interleave_0, values = (var_724, var_721))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_729_begin_0 = const()[name = tensor<string, []>("op_729_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_729_end_0 = const()[name = tensor<string, []>("op_729_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_729_end_mask_0 = const()[name = tensor<string, []>("op_729_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_729 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = x_15)[name = tensor<string, []>("op_729")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = window_27)[name = tensor<string, []>("op_732")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_26, interleave = window_29_interleave_0, values = (var_676, var_673))[name = tensor<string, []>("window_29")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_101 = concat(axis = var_23, interleave = input_101_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_82, interleave = window_29_interleave_0, values = (var_732, var_729))[name = tensor<string, []>("window_29")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_103 = concat(axis = var_69, interleave = input_103_interleave_0, values = (window_23, window_25, window_27, window_29))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_701_split_sizes_0 = const()[name = tensor<string, []>("op_701_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_701_axis_0 = const()[name = tensor<string, []>("op_701_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_701_0, tensor<fp32, [4, 256, 16]> var_701_1 = split(axis = var_701_axis_0, split_sizes = var_701_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_701")];
-            tensor<fp32, [4, 256, 16]> var_703 = sigmoid(x = var_701_1)[name = tensor<string, []>("op_703")];
-            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_701_0, y = var_703)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [4, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [4, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_757_split_sizes_0 = const()[name = tensor<string, []>("op_757_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_757_axis_0 = const()[name = tensor<string, []>("op_757_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_757_0, tensor<fp32, [4, 256, 16]> var_757_1 = split(axis = var_757_axis_0, split_sizes = var_757_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_757")];
+            tensor<fp32, [4, 256, 16]> var_759 = sigmoid(x = var_757_1)[name = tensor<string, []>("op_759")];
+            tensor<fp32, [4, 256, 16]> inputs_25 = mul(x = var_757_0, y = var_759)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [4, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [4, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [4, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [4, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_734_begin_0 = const()[name = tensor<string, []>("op_734_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_734_end_0 = const()[name = tensor<string, []>("op_734_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_734_end_mask_0 = const()[name = tensor<string, []>("op_734_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [4, 1, 256]> var_734 = slice_by_index(begin = var_734_begin_0, end = var_734_end_0, end_mask = var_734_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_734")];
-            tensor<int32, [3]> var_736_perm_0 = const()[name = tensor<string, []>("op_736_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_736 = transpose(perm = var_736_perm_0, x = var_734)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 4, 256]> input_111 = add(x = x_15, y = var_736)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 4, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 4, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_759 = const()[name = tensor<string, []>("op_759"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_760 = mul(x = input_119, y = var_759)[name = tensor<string, []>("op_760")];
-            tensor<fp32, [1, 4, 256]> input_121 = add(x = var_760, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_28, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_790_begin_0 = const()[name = tensor<string, []>("op_790_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_790_end_0 = const()[name = tensor<string, []>("op_790_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_790_end_mask_0 = const()[name = tensor<string, []>("op_790_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [4, 1, 256]> var_790 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_790")];
+            tensor<int32, [3]> var_792_perm_0 = const()[name = tensor<string, []>("op_792_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_792 = transpose(perm = var_792_perm_0, x = var_790)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 4, 256]> input_113 = add(x = x_15, y = var_792)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 4, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 4, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 4, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_815 = const()[name = tensor<string, []>("op_815"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_816 = mul(x = input_121, y = var_815)[name = tensor<string, []>("op_816")];
+            tensor<fp32, [1, 4, 256]> input_123 = add(x = var_816, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 4, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 4, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_789 = const()[name = tensor<string, []>("op_789"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_790 = mul(x = input_131, y = var_789)[name = tensor<string, []>("op_790")];
-            tensor<fp32, [1, 4, 256]> input_133 = add(x = var_790, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 4, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 4, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 4, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 4, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_846 = mul(x = input_133, y = var_845)[name = tensor<string, []>("op_846")];
+            tensor<fp32, [1, 4, 256]> input_135 = add(x = var_846, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_28, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 4, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_66, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -770,209 +792,202 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 4, 256]> var_804 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_805 = const()[name = tensor<string, []>("op_805"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_806 = reshape(shape = var_805, x = var_804)[name = tensor<string, []>("op_806")];
+            tensor<fp32, [1, 4, 256]> var_860 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_861 = const()[name = tensor<string, []>("op_861"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_862 = reshape(shape = var_861, x = var_860)[name = tensor<string, []>("op_862")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_810 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_811 = const()[name = tensor<string, []>("op_811"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 4, 256]> var_812 = mul(x = var_810, y = var_811)[name = tensor<string, []>("op_812")];
-            tensor<int32, [4]> var_813 = const()[name = tensor<string, []>("op_813"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_814 = reshape(shape = var_813, x = var_812)[name = tensor<string, []>("op_814")];
+            tensor<fp32, [1, 4, 256]> var_866 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_867 = const()[name = tensor<string, []>("op_867"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 4, 256]> var_868 = mul(x = var_866, y = var_867)[name = tensor<string, []>("op_868")];
+            tensor<int32, [4]> var_869 = const()[name = tensor<string, []>("op_869"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_870 = reshape(shape = var_869, x = var_868)[name = tensor<string, []>("op_870")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> var_818 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_819 = const()[name = tensor<string, []>("op_819"), val = tensor<int32, [4]>([1, 4, 4, 64])];
-            tensor<fp32, [1, 4, 4, 64]> var_820 = reshape(shape = var_819, x = var_818)[name = tensor<string, []>("op_820")];
+            tensor<fp32, [1, 4, 256]> var_874 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_875 = const()[name = tensor<string, []>("op_875"), val = tensor<int32, [4]>([1, 4, 4, 64])];
+            tensor<fp32, [1, 4, 4, 64]> var_876 = reshape(shape = var_875, x = var_874)[name = tensor<string, []>("op_876")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 4, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 4, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [4]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [4]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_814)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_806)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 4, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_870)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 4, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_862)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 4, 4]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_831 = reshape(shape = var_830, x = sqrt_s_t_7)[name = tensor<string, []>("op_831")];
-            tensor<fp32, [4, 4]> M_7 = real_div(x = encoder__causal_mask, y = var_831)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 4, 4]> var_833 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_833")];
+            tensor<int32, [2]> var_886 = const()[name = tensor<string, []>("op_886"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_887 = reshape(shape = var_886, x = sqrt_s_t_7)[name = tensor<string, []>("op_887")];
+            tensor<fp32, [4, 4]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_887)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 4, 4]> var_889 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_889")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_820)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_833, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_835_transpose_x_0 = const()[name = tensor<string, []>("op_835_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_835_transpose_y_0 = const()[name = tensor<string, []>("op_835_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 4, 64]> var_835 = matmul(transpose_x = var_835_transpose_x_0, transpose_y = var_835_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_835")];
-            tensor<fp32, [4]> var_836 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_836")];
-            tensor<int32, [4]> var_837 = const()[name = tensor<string, []>("op_837"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_838 = reshape(shape = var_837, x = var_836)[name = tensor<string, []>("op_838")];
-            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_835, y = var_838)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 4, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_876)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 4, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_889, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_891_transpose_x_0 = const()[name = tensor<string, []>("op_891_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_891_transpose_y_0 = const()[name = tensor<string, []>("op_891_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 4, 64]> var_891 = matmul(transpose_x = var_891_transpose_x_0, transpose_y = var_891_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_891")];
+            tensor<fp32, [4]> var_892 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_892")];
+            tensor<int32, [4]> var_893 = const()[name = tensor<string, []>("op_893"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_894 = reshape(shape = var_893, x = var_892)[name = tensor<string, []>("op_894")];
+            tensor<fp32, [1, 4, 4, 64]> cross_7 = mul(x = var_891, y = var_894)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 4, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_841 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_841")];
-            tensor<bool, []> var_843_transpose_x_1 = const()[name = tensor<string, []>("op_843_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_843_transpose_y_1 = const()[name = tensor<string, []>("op_843_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_843 = matmul(transpose_x = var_843_transpose_x_1, transpose_y = var_843_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_843")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_841, y = var_843)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_845 = const()[name = tensor<string, []>("op_845"), val = tensor<fp32, []>(0x1p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_845)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_847 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_847")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_847)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_849_perm_0 = const()[name = tensor<string, []>("op_849_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_897 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_897")];
+            tensor<bool, []> var_899_transpose_x_1 = const()[name = tensor<string, []>("op_899_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_899_transpose_y_1 = const()[name = tensor<string, []>("op_899_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_899 = matmul(transpose_x = var_899_transpose_x_1, transpose_y = var_899_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_897, y = var_899)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_901 = const()[name = tensor<string, []>("op_901"), val = tensor<fp32, []>(0x1p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_901)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_903 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_903")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_903)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_905_perm_0 = const()[name = tensor<string, []>("op_905_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 4, 64]> var_849 = transpose(perm = var_849_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_17, x = var_849)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_853 = const()[name = tensor<string, []>("op_853"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_853, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 4, 256]> var_855 = silu(x = input_137)[name = tensor<string, []>("op_855")];
-            tensor<fp32, [1, 4, 256]> input_139 = mul(x = var_855, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 4, 4, 64]> var_905 = transpose(perm = var_905_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 4, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_74, x = var_905)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_909 = const()[name = tensor<string, []>("op_909"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<fp32, [1, 4, 256]> out_23 = reshape(shape = var_909, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 4, 256]> var_911 = silu(x = input_139)[name = tensor<string, []>("op_911")];
+            tensor<fp32, [1, 4, 256]> input_141 = mul(x = var_911, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 4, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 4, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_31_begin_0 = const()[name = tensor<string, []>("window_31_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_31_end_0 = const()[name = tensor<string, []>("window_31_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_31_end_mask_0 = const()[name = tensor<string, []>("window_31_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_31_squeeze_mask_0 = const()[name = tensor<string, []>("window_31_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_31 = slice_by_index(begin = window_31_begin_0, end = window_31_end_0, end_mask = window_31_end_mask_0, squeeze_mask = window_31_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_863_begin_0 = const()[name = tensor<string, []>("op_863_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_863_end_0 = const()[name = tensor<string, []>("op_863_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_863_end_mask_0 = const()[name = tensor<string, []>("op_863_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_863 = slice_by_index(begin = var_863_begin_0, end = var_863_end_0, end_mask = var_863_end_mask_0, x = x_21)[name = tensor<string, []>("op_863")];
-            tensor<int32, [3]> var_866_begin_0 = const()[name = tensor<string, []>("op_866_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_866_end_0 = const()[name = tensor<string, []>("op_866_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_866_end_mask_0 = const()[name = tensor<string, []>("op_866_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_866 = slice_by_index(begin = var_866_begin_0, end = var_866_end_0, end_mask = var_866_end_mask_0, x = window_31)[name = tensor<string, []>("op_866")];
+            tensor<int32, [3]> var_919_begin_0 = const()[name = tensor<string, []>("op_919_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_919_end_0 = const()[name = tensor<string, []>("op_919_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_919_end_mask_0 = const()[name = tensor<string, []>("op_919_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_919 = slice_by_index(begin = var_919_begin_0, end = var_919_end_0, end_mask = var_919_end_mask_0, x = x_21)[name = tensor<string, []>("op_919")];
+            tensor<int32, [3]> var_922_begin_0 = const()[name = tensor<string, []>("op_922_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_922_end_0 = const()[name = tensor<string, []>("op_922_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_922_end_mask_0 = const()[name = tensor<string, []>("op_922_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_922 = slice_by_index(begin = var_922_begin_0, end = var_922_end_0, end_mask = var_922_end_mask_0, x = window_31)[name = tensor<string, []>("op_922")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_26, interleave = window_33_interleave_0, values = (var_866, var_863))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_871 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = x_21)[name = tensor<string, []>("op_871")];
-            tensor<int32, [3]> var_874_begin_0 = const()[name = tensor<string, []>("op_874_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_874_end_0 = const()[name = tensor<string, []>("op_874_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_874_end_mask_0 = const()[name = tensor<string, []>("op_874_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_874 = slice_by_index(begin = var_874_begin_0, end = var_874_end_0, end_mask = var_874_end_mask_0, x = window_33)[name = tensor<string, []>("op_874")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_82, interleave = window_33_interleave_0, values = (var_922, var_919))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_927_begin_0 = const()[name = tensor<string, []>("op_927_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_927_end_0 = const()[name = tensor<string, []>("op_927_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_927_end_mask_0 = const()[name = tensor<string, []>("op_927_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_927 = slice_by_index(begin = var_927_begin_0, end = var_927_end_0, end_mask = var_927_end_mask_0, x = x_21)[name = tensor<string, []>("op_927")];
+            tensor<int32, [3]> var_930_begin_0 = const()[name = tensor<string, []>("op_930_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_930_end_0 = const()[name = tensor<string, []>("op_930_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_930_end_mask_0 = const()[name = tensor<string, []>("op_930_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_930 = slice_by_index(begin = var_930_begin_0, end = var_930_end_0, end_mask = var_930_end_mask_0, x = window_33)[name = tensor<string, []>("op_930")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_26, interleave = window_35_interleave_0, values = (var_874, var_871))[name = tensor<string, []>("window_35")];
-            tensor<int32, [3]> var_879_begin_0 = const()[name = tensor<string, []>("op_879_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_879_end_0 = const()[name = tensor<string, []>("op_879_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_879_end_mask_0 = const()[name = tensor<string, []>("op_879_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_879 = slice_by_index(begin = var_879_begin_0, end = var_879_end_0, end_mask = var_879_end_mask_0, x = x_21)[name = tensor<string, []>("op_879")];
-            tensor<int32, [3]> var_882_begin_0 = const()[name = tensor<string, []>("op_882_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_882_end_0 = const()[name = tensor<string, []>("op_882_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_882_end_mask_0 = const()[name = tensor<string, []>("op_882_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_882 = slice_by_index(begin = var_882_begin_0, end = var_882_end_0, end_mask = var_882_end_mask_0, x = window_35)[name = tensor<string, []>("op_882")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_82, interleave = window_35_interleave_0, values = (var_930, var_927))[name = tensor<string, []>("window_35")];
+            tensor<int32, [3]> var_935_begin_0 = const()[name = tensor<string, []>("op_935_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_935_end_0 = const()[name = tensor<string, []>("op_935_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_935_end_mask_0 = const()[name = tensor<string, []>("op_935_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_935 = slice_by_index(begin = var_935_begin_0, end = var_935_end_0, end_mask = var_935_end_mask_0, x = x_21)[name = tensor<string, []>("op_935")];
+            tensor<int32, [3]> var_938_begin_0 = const()[name = tensor<string, []>("op_938_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_938_end_0 = const()[name = tensor<string, []>("op_938_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_938_end_mask_0 = const()[name = tensor<string, []>("op_938_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_938 = slice_by_index(begin = var_938_begin_0, end = var_938_end_0, end_mask = var_938_end_mask_0, x = window_35)[name = tensor<string, []>("op_938")];
             tensor<bool, []> window_37_interleave_0 = const()[name = tensor<string, []>("window_37_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_26, interleave = window_37_interleave_0, values = (var_882, var_879))[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_887_begin_0 = const()[name = tensor<string, []>("op_887_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_887_end_0 = const()[name = tensor<string, []>("op_887_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_887_end_mask_0 = const()[name = tensor<string, []>("op_887_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_887 = slice_by_index(begin = var_887_begin_0, end = var_887_end_0, end_mask = var_887_end_mask_0, x = x_21)[name = tensor<string, []>("op_887")];
-            tensor<int32, [3]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_890 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = window_37)[name = tensor<string, []>("op_890")];
+            tensor<fp32, [1, 16, 256]> window_37 = concat(axis = var_82, interleave = window_37_interleave_0, values = (var_938, var_935))[name = tensor<string, []>("window_37")];
+            tensor<int32, [3]> var_943_begin_0 = const()[name = tensor<string, []>("op_943_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_943_end_0 = const()[name = tensor<string, []>("op_943_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_943_end_mask_0 = const()[name = tensor<string, []>("op_943_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_943 = slice_by_index(begin = var_943_begin_0, end = var_943_end_0, end_mask = var_943_end_mask_0, x = x_21)[name = tensor<string, []>("op_943")];
+            tensor<int32, [3]> var_946_begin_0 = const()[name = tensor<string, []>("op_946_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_946_end_0 = const()[name = tensor<string, []>("op_946_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_946_end_mask_0 = const()[name = tensor<string, []>("op_946_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_946 = slice_by_index(begin = var_946_begin_0, end = var_946_end_0, end_mask = var_946_end_mask_0, x = window_37)[name = tensor<string, []>("op_946")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_26, interleave = window_interleave_0, values = (var_890, var_887))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4, 16, 256]> input_141 = concat(axis = var_23, interleave = input_141_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_82, interleave = window_interleave_0, values = (var_946, var_943))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [4, 16, 256]> input_143 = concat(axis = var_69, interleave = input_143_interleave_0, values = (window_33, window_35, window_37, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [4, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_915_split_sizes_0 = const()[name = tensor<string, []>("op_915_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_915_axis_0 = const()[name = tensor<string, []>("op_915_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> var_915_0, tensor<fp32, [4, 256, 16]> var_915_1 = split(axis = var_915_axis_0, split_sizes = var_915_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_915")];
-            tensor<fp32, [4, 256, 16]> var_917 = sigmoid(x = var_915_1)[name = tensor<string, []>("op_917")];
-            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_915_0, y = var_917)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [4, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [4, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_971_split_sizes_0 = const()[name = tensor<string, []>("op_971_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_971_axis_0 = const()[name = tensor<string, []>("op_971_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> var_971_0, tensor<fp32, [4, 256, 16]> var_971_1 = split(axis = var_971_axis_0, split_sizes = var_971_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_971")];
+            tensor<fp32, [4, 256, 16]> var_973 = sigmoid(x = var_971_1)[name = tensor<string, []>("op_973")];
+            tensor<fp32, [4, 256, 16]> inputs_35 = mul(x = var_971_0, y = var_973)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [4, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_28, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [4, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [4, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [4, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([4, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [4, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [4, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_66, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [4, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [4, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
-            tensor<bool, [3]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [4, 1, 256]> var_948 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_948")];
-            tensor<int32, [3]> var_950_perm_0 = const()[name = tensor<string, []>("op_950_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 4, 256]> var_950 = transpose(perm = var_950_perm_0, x = var_948)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 4, 256]> input_151 = add(x = x_21, y = var_950)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_28, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 4, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 4, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_973 = const()[name = tensor<string, []>("op_973"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 4, 256]> var_974 = mul(x = input_159, y = var_973)[name = tensor<string, []>("op_974")];
-            tensor<fp32, [1, 4, 256]> input_161 = add(x = var_974, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [3]>([4, 16, 256])];
+            tensor<bool, [3]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [4, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [4, 1, 256]> var_1004 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1004")];
+            tensor<int32, [3]> var_1006_perm_0 = const()[name = tensor<string, []>("op_1006_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 4, 256]> var_1006 = transpose(perm = var_1006_perm_0, x = var_1004)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 4, 256]> input_153 = add(x = x_21, y = var_1006)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_66, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 4, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 4, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 4, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1029 = const()[name = tensor<string, []>("op_1029"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 4, 256]> var_1030 = mul(x = input_161, y = var_1029)[name = tensor<string, []>("op_1030")];
+            tensor<fp32, [1, 4, 256]> input_163 = add(x = var_1030, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_28, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 4, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_66, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 4]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_20, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 22]> cat = concat(axis = var_71, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
-            tensor<int32, [3]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
-            tensor<bool, [3]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = cat)[name = tensor<string, []>("op_992")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_994 = const()[name = tensor<string, []>("op_994"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 4, 1]> var_995 = reduce_l2_norm(axes = var_994, keep_dims = var_29, x = input_163)[name = tensor<string, []>("op_995")];
+            tensor<fp32, [1, 256, 4]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1048_begin_0 = const()[name = tensor<string, []>("op_1048_begin_0"), val = tensor<int32, [3]>([0, 0, 4])];
+            tensor<int32, [3]> var_1048_end_0 = const()[name = tensor<string, []>("op_1048_end_0"), val = tensor<int32, [3]>([1, 256, 22])];
+            tensor<bool, [3]> var_1048_end_mask_0 = const()[name = tensor<string, []>("op_1048_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1048_begin_0, end = var_1048_end_0, end_mask = var_1048_end_mask_0, x = cat)[name = tensor<string, []>("op_1048")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 4, 1]> var_1051 = reduce_l2_norm(axes = var_1050, keep_dims = var_65, x = input_165)[name = tensor<string, []>("op_1051")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_995)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_999_axis_0 = const()[name = tensor<string, []>("op_999_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_999_axis_0, values = (var_206, var_420, var_634, nkv_1))[name = tensor<string, []>("op_999")];
-            tensor<int32, []> var_1001_axis_0 = const()[name = tensor<string, []>("op_1001_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1001_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1001")];
-            tensor<int32, []> var_1003_axis_0 = const()[name = tensor<string, []>("op_1003_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1003_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1003")];
-            tensor<fp32, []> var_1012 = const()[name = tensor<string, []>("op_1012"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1017 = const()[name = tensor<string, []>("op_1017"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1019 = const()[name = tensor<string, []>("op_1019"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1020 = const()[name = tensor<string, []>("op_1020"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1022 = const()[name = tensor<string, []>("op_1022"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1026 = const()[name = tensor<string, []>("op_1026"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1032 = const()[name = tensor<string, []>("op_1032"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 4, 1]> clip_0 = clip(alpha = var_79, beta = const_12, x = var_1051)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 4, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1055_axis_0 = const()[name = tensor<string, []>("op_1055_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1055_axis_0, values = (var_262, var_476, var_690, nkv_1))[name = tensor<string, []>("op_1055")];
+            tensor<int32, []> var_1057_axis_0 = const()[name = tensor<string, []>("op_1057_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1057_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1057")];
+            tensor<int32, []> var_1059_axis_0 = const()[name = tensor<string, []>("op_1059_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1059_axis_0, values = (window_9, window_19, window_29, window))[name = tensor<string, []>("op_1059")];
             tensor<fp32, [1, 4, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 4, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395584)))];
-            tensor<int32, [1]> var_1094_axes_0 = const()[name = tensor<string, []>("op_1094_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 4, 1, 256]> var_1094 = expand_dims(axes = var_1094_axes_0, x = emb)[name = tensor<string, []>("op_1094")];
+            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 4, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 4, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1094)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 12, 512]> input_165 = concat(axis = var_1026, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 4, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1102_perm_0 = const()[name = tensor<string, []>("op_1102_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1106 = const()[name = tensor<string, []>("op_1106"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1102 = transpose(perm = var_1102_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 4, 256]> x_29 = reshape(shape = var_1106, x = var_1102)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 4, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 12, 512]> input_167 = concat(axis = var_72, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 4, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 4, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -983,132 +998,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 4, 256]> var_1114 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1115 = const()[name = tensor<string, []>("op_1115"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1116 = reshape(shape = var_1115, x = var_1114)[name = tensor<string, []>("op_1116")];
+            tensor<fp32, [12, 4, 256]> var_1147 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1120 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1121 = const()[name = tensor<string, []>("op_1121"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 4, 256]> var_1122 = mul(x = var_1120, y = var_1121)[name = tensor<string, []>("op_1122")];
-            tensor<int32, [4]> var_1123 = const()[name = tensor<string, []>("op_1123"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1124 = reshape(shape = var_1123, x = var_1122)[name = tensor<string, []>("op_1124")];
+            tensor<fp32, [12, 4, 256]> var_1153 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 4, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
+            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1128 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1129 = const()[name = tensor<string, []>("op_1129"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1130 = reshape(shape = var_1129, x = var_1128)[name = tensor<string, []>("op_1130")];
+            tensor<fp32, [12, 4, 256]> var_1161 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 4, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_1032, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [4]> cumsum_mask_1 = cumsum(axis = var_69, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [4]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_1 = clip(alpha = var_1022, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [4]> clip_1 = clip(alpha = var_59, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [4]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1124)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1116)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 4, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 4, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 4, 4]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1142 = const()[name = tensor<string, []>("op_1142"), val = tensor<int32, [2]>([1, 4])];
-            tensor<fp32, [1, 4]> var_1143 = reshape(shape = var_1142, x = valid_mask)[name = tensor<string, []>("op_1143")];
-            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1143)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1145 = const()[name = tensor<string, []>("op_1145"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1146 = reshape(shape = var_1145, x = sqrt_s_t_9)[name = tensor<string, []>("op_1146")];
-            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1146)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 4, 4]> var_1148 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1148")];
+            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 4])];
+            tensor<fp32, [1, 4]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
+            tensor<fp32, [4, 4]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
+            tensor<fp32, [4, 4]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 4, 4]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1130)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1148, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1150_transpose_x_0 = const()[name = tensor<string, []>("op_1150_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1150_transpose_y_0 = const()[name = tensor<string, []>("op_1150_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> var_1150 = matmul(transpose_x = var_1150_transpose_x_0, transpose_y = var_1150_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1150")];
-            tensor<fp32, [4]> var_1151 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1151")];
-            tensor<int32, [4]> var_1152 = const()[name = tensor<string, []>("op_1152"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1153 = reshape(shape = var_1152, x = var_1151)[name = tensor<string, []>("op_1153")];
-            tensor<fp32, [12, 4, 4, 64]> cross_9 = mul(x = var_1150, y = var_1153)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 4, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 4, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
+            tensor<fp32, [4]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
+            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
+            tensor<fp32, [12, 4, 4, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 4, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1157 = reshape(shape = var_1156, x = valid_mask)[name = tensor<string, []>("op_1157")];
-            tensor<fp32, [12, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1157)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1159 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1159")];
-            tensor<bool, []> var_1161_transpose_x_1 = const()[name = tensor<string, []>("op_1161_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1161_transpose_y_1 = const()[name = tensor<string, []>("op_1161_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1161 = matmul(transpose_x = var_1161_transpose_x_1, transpose_y = var_1161_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1161")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1159, y = var_1161)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1163_keep_dims_0 = const()[name = tensor<string, []>("op_1163_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1163 = reduce_sum(keep_dims = var_1163_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1163")];
-            tensor<int32, [1]> var_1164 = const()[name = tensor<string, []>("op_1164"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1165 = reshape(shape = var_1164, x = var_1163)[name = tensor<string, []>("op_1165")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1165)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
+            tensor<fp32, [12, 4, 4, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
+            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
+            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1022, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_59, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1169 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1169")];
-            tensor<int32, [4]> var_1170_perm_0 = const()[name = tensor<string, []>("op_1170_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
+            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 4, 64]> var_1170 = transpose(perm = var_1170_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1019, x = var_1170)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1174 = const()[name = tensor<string, []>("op_1174"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> out_29 = reshape(shape = var_1174, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 4, 256]> var_1176 = silu(x = input_169)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [12, 4, 256]> input_171 = mul(x = var_1176, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 4, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 4, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 4, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 4, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_74, x = var_1203)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 4, 256]> var_1209 = silu(x = input_171)[name = tensor<string, []>("op_1209")];
+            tensor<fp32, [12, 4, 256]> input_173 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 4, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 4, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1017, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1186 = const()[name = tensor<string, []>("op_1186"), val = tensor<int32, [4]>([1, 12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1187 = reshape(shape = var_1186, x = xt_1)[name = tensor<string, []>("op_1187")];
-            tensor<int32, [4]> var_1188_perm_0 = const()[name = tensor<string, []>("op_1188_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [3]>([4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> var_1188 = transpose(perm = var_1188_perm_0, x = var_1187)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [4, 12, 256]> query_1 = reshape(shape = var_1191, x = var_1188)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 4, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_66, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
+            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [4, 12, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 4, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 4, 768]> var_1214 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 4, 768]> var_1247 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 4, 3, 256])];
-            tensor<fp32, [12, 4, 3, 256]> var_1216 = reshape(shape = concat_1, x = var_1214)[name = tensor<string, []>("op_1216")];
-            tensor<int32, [1]> var_1217_axes_0 = const()[name = tensor<string, []>("op_1217_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 4, 3, 256]> var_1217 = expand_dims(axes = var_1217_axes_0, x = var_1216)[name = tensor<string, []>("op_1217")];
-            tensor<int32, [5]> var_1218_perm_0 = const()[name = tensor<string, []>("op_1218_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1219_axes_0 = const()[name = tensor<string, []>("op_1219_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 4, 1, 256]> var_1218 = transpose(perm = var_1218_perm_0, x = var_1217)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 4, 256]> var_1219 = squeeze(axes = var_1219_axes_0, x = var_1218)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [12, 4, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
+            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 4, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
+            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 4, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 4, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 4, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 4, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 4, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 4, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 4, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1219)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1227 = const()[name = tensor<string, []>("op_1227"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1228 = reshape(shape = var_1227, x = q_11)[name = tensor<string, []>("op_1228")];
+            tensor<fp32, [12, 4, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1234 = const()[name = tensor<string, []>("op_1234"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1235 = reshape(shape = var_1234, x = k_11)[name = tensor<string, []>("op_1235")];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1241 = const()[name = tensor<string, []>("op_1241"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1242 = reshape(shape = var_1241, x = v_11)[name = tensor<string, []>("op_1242")];
+            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1245 = const()[name = tensor<string, []>("op_1245"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1228)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [4, 4, 12, 64]> q_15 = reshape(shape = var_1245, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1247 = const()[name = tensor<string, []>("op_1247"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1235)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [4, 4, 12, 64]> k_15 = reshape(shape = var_1247, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1249 = const()[name = tensor<string, []>("op_1249"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1242)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [4, 4, 12, 64]> v_15 = reshape(shape = var_1249, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [4, 4, 12, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [4, 4, 12, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [4, 4, 12, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1119,30 +1134,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1252 = const()[name = tensor<string, []>("op_1252"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1257 = const()[name = tensor<string, []>("op_1257"), val = tensor<int32, [2]>([48, 256])];
-            tensor<fp32, [12, 4, 4, 64]> var_1253 = transpose(perm = var_1252, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [48, 256]> attn_output_3 = reshape(shape = var_1257, x = var_1253)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [48, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1261 = const()[name = tensor<string, []>("op_1261"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> attn_output_7 = reshape(shape = var_1261, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([48, 256])];
+            tensor<fp32, [12, 4, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [48, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [48, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [4, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1017, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [4, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [4, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [4, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [4, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_66, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [4, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [4, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [4, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [4, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1017, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1281 = const()[name = tensor<string, []>("op_1281"), val = tensor<int32, [4]>([1, 4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> x_31 = reshape(shape = var_1281, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1283_perm_0 = const()[name = tensor<string, []>("op_1283_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1287 = const()[name = tensor<string, []>("op_1287"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1283 = transpose(perm = var_1283_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 4, 256]> x = reshape(shape = var_1287, x = var_1283)[name = tensor<string, []>("x")];
+            tensor<fp32, [4, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_66, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 4, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1153,120 +1168,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 4, 256]> var_1295 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1296 = const()[name = tensor<string, []>("op_1296"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1297 = reshape(shape = var_1296, x = var_1295)[name = tensor<string, []>("op_1297")];
+            tensor<fp32, [12, 4, 256]> var_1328 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1301 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1302 = const()[name = tensor<string, []>("op_1302"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 4, 256]> var_1303 = mul(x = var_1301, y = var_1302)[name = tensor<string, []>("op_1303")];
-            tensor<int32, [4]> var_1304 = const()[name = tensor<string, []>("op_1304"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1305 = reshape(shape = var_1304, x = var_1303)[name = tensor<string, []>("op_1305")];
+            tensor<fp32, [12, 4, 256]> var_1334 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 4, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
+            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> var_1309 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [4]>([12, 4, 4, 64])];
-            tensor<fp32, [12, 4, 4, 64]> var_1311 = reshape(shape = var_1310, x = var_1309)[name = tensor<string, []>("op_1311")];
+            tensor<fp32, [12, 4, 256]> var_1342 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([12, 4, 4, 64])];
+            tensor<fp32, [12, 4, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 4, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 4, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [4]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [4]> clip_3 = clip(alpha = var_1022, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [4]> clip_3 = clip(alpha = var_59, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [4]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1305)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1297)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 4, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 4, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 4, 4]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1326 = const()[name = tensor<string, []>("op_1326"), val = tensor<int32, [2]>([4, 1])];
-            tensor<fp32, [4, 1]> var_1327 = reshape(shape = var_1326, x = sqrt_s_t)[name = tensor<string, []>("op_1327")];
-            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1327)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 4, 4]> var_1329 = mul(x = qk, y = M)[name = tensor<string, []>("op_1329")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1311)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 4, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1329, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1331_transpose_x_0 = const()[name = tensor<string, []>("op_1331_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1331_transpose_y_0 = const()[name = tensor<string, []>("op_1331_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 4, 64]> var_1331 = matmul(transpose_x = var_1331_transpose_x_0, transpose_y = var_1331_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1331")];
-            tensor<fp32, [4]> var_1332 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1332")];
-            tensor<int32, [4]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [4]>([1, 1, 4, 1])];
-            tensor<fp32, [1, 1, 4, 1]> var_1334 = reshape(shape = var_1333, x = var_1332)[name = tensor<string, []>("op_1334")];
-            tensor<fp32, [12, 4, 4, 64]> cross = mul(x = var_1331, y = var_1334)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 4, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1157)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1340 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1340")];
-            tensor<bool, []> var_1342_transpose_x_1 = const()[name = tensor<string, []>("op_1342_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1342_transpose_y_1 = const()[name = tensor<string, []>("op_1342_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1342 = matmul(transpose_x = var_1342_transpose_x_1, transpose_y = var_1342_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1342")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1340, y = var_1342)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1165)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([4, 1])];
+            tensor<fp32, [4, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
+            tensor<fp32, [4, 4]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 4, 4]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 4, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 4, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
+            tensor<fp32, [4]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
+            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 4, 1])];
+            tensor<fp32, [1, 1, 4, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
+            tensor<fp32, [12, 4, 4, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 4, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 4, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
+            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1022, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_59, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1351_perm_0 = const()[name = tensor<string, []>("op_1351_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 4, 64]> var_1351 = transpose(perm = var_1351_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1019, x = var_1351)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1355 = const()[name = tensor<string, []>("op_1355"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> out = reshape(shape = var_1355, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 4, 256]> var_1357 = silu(x = input_187)[name = tensor<string, []>("op_1357")];
-            tensor<fp32, [12, 4, 256]> input_189 = mul(x = var_1357, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 4, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 4, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 4, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 4, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_74, x = var_1384)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 4, 256]> var_1390 = silu(x = input_189)[name = tensor<string, []>("op_1390")];
+            tensor<fp32, [12, 4, 256]> input_191 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 4, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 4, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1017, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1367 = const()[name = tensor<string, []>("op_1367"), val = tensor<int32, [4]>([1, 12, 4, 256])];
-            tensor<fp32, [1, 12, 4, 256]> var_1368 = reshape(shape = var_1367, x = xt_5)[name = tensor<string, []>("op_1368")];
-            tensor<int32, [4]> var_1369_perm_0 = const()[name = tensor<string, []>("op_1369_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [3]>([4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> var_1369 = transpose(perm = var_1369_perm_0, x = var_1368)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [4, 12, 256]> query_5 = reshape(shape = var_1372, x = var_1369)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 4, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_66, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 12, 4, 256])];
+            tensor<fp32, [1, 12, 4, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
+            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [4, 12, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 4, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 4, 768]> var_1395 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 4, 768]> var_1428 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 4, 3, 256])];
-            tensor<fp32, [12, 4, 3, 256]> var_1397 = reshape(shape = concat_2, x = var_1395)[name = tensor<string, []>("op_1397")];
-            tensor<int32, [1]> var_1398_axes_0 = const()[name = tensor<string, []>("op_1398_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 4, 3, 256]> var_1398 = expand_dims(axes = var_1398_axes_0, x = var_1397)[name = tensor<string, []>("op_1398")];
-            tensor<int32, [5]> var_1399_perm_0 = const()[name = tensor<string, []>("op_1399_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1400_axes_0 = const()[name = tensor<string, []>("op_1400_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 4, 1, 256]> var_1399 = transpose(perm = var_1399_perm_0, x = var_1398)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 4, 256]> var_1400 = squeeze(axes = var_1400_axes_0, x = var_1399)[name = tensor<string, []>("op_1400")];
+            tensor<fp32, [12, 4, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
+            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 4, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
+            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 4, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 4, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 4, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 4, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 4, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 4, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 4, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1400)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1409 = reshape(shape = var_1408, x = q_19)[name = tensor<string, []>("op_1409")];
+            tensor<fp32, [12, 4, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1415 = const()[name = tensor<string, []>("op_1415"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1416 = reshape(shape = var_1415, x = k_19)[name = tensor<string, []>("op_1416")];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1422 = const()[name = tensor<string, []>("op_1422"), val = tensor<int32, [3]>([12, 16, 64])];
-            tensor<fp32, [12, 16, 64]> var_1423 = reshape(shape = var_1422, x = v_19)[name = tensor<string, []>("op_1423")];
+            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([12, 16, 64])];
+            tensor<fp32, [12, 16, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1426 = const()[name = tensor<string, []>("op_1426"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1409)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [4, 4, 12, 64]> q = reshape(shape = var_1426, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1428 = const()[name = tensor<string, []>("op_1428"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1416)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [4, 4, 12, 64]> k = reshape(shape = var_1428, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1430 = const()[name = tensor<string, []>("op_1430"), val = tensor<int32, [4]>([4, 4, 12, 64])];
-            tensor<fp32, [16, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1423)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [4, 4, 12, 64]> v = reshape(shape = var_1430, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [4, 4, 12, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [4, 4, 12, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([4, 4, 12, 64])];
+            tensor<fp32, [16, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [4, 4, 12, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [4, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1277,36 +1292,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [4, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1433 = const()[name = tensor<string, []>("op_1433"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1438 = const()[name = tensor<string, []>("op_1438"), val = tensor<int32, [2]>([48, 256])];
-            tensor<fp32, [12, 4, 4, 64]> var_1434 = transpose(perm = var_1433, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [48, 256]> attn_output_11 = reshape(shape = var_1438, x = var_1434)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [48, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1442 = const()[name = tensor<string, []>("op_1442"), val = tensor<int32, [3]>([12, 4, 256])];
-            tensor<fp32, [12, 4, 256]> attn_output = reshape(shape = var_1442, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([48, 256])];
+            tensor<fp32, [12, 4, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [48, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [48, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([12, 4, 256])];
+            tensor<fp32, [12, 4, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [4, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [4, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1017, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [4, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [4, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [4, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [4, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [4, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_66, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [4, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [4, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [4, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [4, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [4, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1017, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1462 = const()[name = tensor<string, []>("op_1462"), val = tensor<int32, [4]>([1, 4, 12, 256])];
-            tensor<fp32, [1, 4, 12, 256]> input = reshape(shape = var_1462, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1464 = const()[name = tensor<string, []>("op_1464"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 4, 12, 1]> var_1465 = reduce_l2_norm(axes = var_1464, keep_dims = var_1020, x = input)[name = tensor<string, []>("op_1465")];
+            tensor<fp32, [4, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_66, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 4, 12, 256])];
+            tensor<fp32, [1, 4, 12, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 4, 12, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_65, x = input)[name = tensor<string, []>("op_1498")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 4, 12, 1]> clip_5 = clip(alpha = var_1012, beta = const_42, x = var_1465)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 4, 12, 256]> var_1467 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1467")];
+            tensor<fp32, [1, 4, 12, 1]> clip_5 = clip(alpha = var_79, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 4, 12, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([4, 1, 256])];
             tensor<fp32, [4, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([4, 256, 12])];
-            tensor<fp32, [1, 4, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1467)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 4, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [4, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1317,10 +1332,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 4, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 4, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 4, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1471")];
-            tensor<int32, []> var_1473_axis_0 = const()[name = tensor<string, []>("op_1473_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1473_axis_0, values = (var_1169, nkv))[name = tensor<string, []>("op_1473")];
-            tensor<int32, []> var_1475_axis_0 = const()[name = tensor<string, []>("op_1475_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1475_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1475")];
+            tensor<fp32, [1, 4, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
+            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
+            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 966abc59c27168f5003194361dd833d1665658b1..ea36f705b04c49693e5c48fc1441db62573a3b14 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:000abbd1df11bb0943f0adfab67500eee955b12db8f10bdc9369e7e808b09657
-size 191053
+oid sha256:d7ca94092df462cdab87116ed8e295dc81895635f82bbf9f00a73e4b8816b11d
+size 197125
diff --git a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Manifest.json b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Manifest.json
index 7f043626a70d0c5eccc20cae217fcc5767d6d198..d123250cff4561b4464284ef420ac06e692da2ef 100644
--- a/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Manifest.json
+++ b/optimized/dih3/400ms/ls_eend_dih3_400ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "7E6A9ABB-631C-4011-875E-660D5DCE8EFC": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "A7E5EDC0-82DC-425C-A42A-618F7B2317F7": {
+        "17D6D145-F4A1-42F1-BE9D-64E3EE4EB6F2": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "B19871A9-DBD2-40BC-AB61-069157BF4EFD": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "7E6A9ABB-631C-4011-875E-660D5DCE8EFC"
+    "rootModelIdentifier": "B19871A9-DBD2-40BC-AB61-069157BF4EFD"
 }
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/analytics/coremldata.bin b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/analytics/coremldata.bin
index 4d38c0949a58bc566d5e4794910ebd4ae04a7a27..2b2834505010012c170e9a17ae542b2bca78c3c8 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/analytics/coremldata.bin
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f8e24abfe7f695ffad28e2adc4c3072705f7bf9cf80102cc782f8e933e72f27
+oid sha256:0d0577390ca35f2d7e42a1e75c3d4275127b262c37750a48fec0ac1d711ab2ff
 size 243
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/coremldata.bin b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/coremldata.bin
index 8915b742fe103cc63e86c1d10e5b105b8647f78d..dd62f2e097c2cc7ef122683ba8079a5bcc35ec5e 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/coremldata.bin
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9da2e560b5f0570964bbcfe58e5520dfcb2b29af2679608619c52ae68309c02
-size 1310
+oid sha256:3cc2c2502a1cfbc4ccf456695ae9130fb7f0dbfb992c52a2ebd8f20654744133
+size 1413
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/metadata.json b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/metadata.json
index bbd1303a91af367bee74f4f22f06f98437f409e1..d982b6126dab58eb13d536236f47070ffa68742e 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/metadata.json
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/metadata.json
@@ -1,7 +1,7 @@
 [
   {
     "metadataOutputVersion" : "3.0",
-    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=5, max_speakers=10)",
+    "shortDescription" : "LS-EEND DIHARD III streaming diarizer (pipeline, T=5, max_speakers=10, layout=raw_mel, cu=all)",
     "outputSchema" : [
       {
         "hasShapeFlexibility" : "0",
@@ -81,7 +81,7 @@
     "specificationVersion" : 8,
     "mlProgramOperationTypeHistogram" : {
       "Ios16.reduceL2Norm" : 2,
-      "Ios17.reshape" : 68,
+      "Ios17.reshape" : 69,
       "Ios16.softmax" : 2,
       "Ios17.matmul" : 29,
       "Ios17.transpose" : 57,
@@ -89,7 +89,7 @@
       "Ios17.expandDims" : 3,
       "Ios17.add" : 46,
       "Ios16.sigmoid" : 5,
-      "Ios17.sliceByIndex" : 72,
+      "Ios17.sliceByIndex" : 77,
       "Tile" : 1,
       "Ios16.reduceSum" : 1,
       "Ios17.squeeze" : 2,
@@ -101,7 +101,7 @@
       "Ios16.silu" : 18,
       "Ios17.realDiv" : 20,
       "Ios17.linear" : 56,
-      "Stack" : 5,
+      "Stack" : 6,
       "Ios17.concat" : 26,
       "Ios16.relu" : 2,
       "Ios16.cumsum" : 1,
@@ -128,9 +128,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 5 × 345)",
+        "formattedType" : "MultiArray (Float32 1 × 55 × 23)",
         "shortDescription" : "",
-        "shape" : "[1, 5, 345]",
+        "shape" : "[1, 55, 23]",
         "name" : "features",
         "type" : "MultiArray"
       },
@@ -206,8 +206,8 @@
       }
     ],
     "userDefinedMetadata" : {
-      "com.github.apple.coremltools.conversion_date" : "2026-04-16",
-      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true}",
+      "com.github.apple.coremltools.conversion_date" : "2026-04-18",
+      "config" : "{\"model_name\": \"dih3\", \"model_label\": \"DIHARD III\", \"variant\": \"pipeline\", \"chunk_size\": 5, \"step_duration_ms\": 500, \"frame_duration_ms\": 100, \"frame_duration_seconds\": 0.1, \"max_speakers\": 10, \"max_nspks\": 12, \"n_units\": 256, \"n_heads\": 4, \"enc_n_layers\": 4, \"dec_n_layers\": 2, \"conv_kernel_size\": 16, \"conv_delay\": 9, \"sample_rate\": 8000, \"win_length\": 200, \"hop_length\": 80, \"n_mels\": 23, \"context_size\": 7, \"subsampling\": 10, \"feat_type\": \"logmel23_cummn\", \"pure_roll\": true, \"input_layout\": \"raw_mel\", \"compute_units_export\": \"all\", \"raw_mel_length\": 55}",
       "com.github.apple.coremltools.source" : "torch==2.6.0",
       "com.github.apple.coremltools.version" : "9.0",
       "com.github.apple.coremltools.source_dialect" : "TorchScript"
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/model.mil b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/model.mil
index d06d8cb5807c4c0b54c3e07f38ba5655c925422a..ab59896474305668d95443aae5b48ae6a2df960c 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/model.mil
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlmodelc/model.mil
@@ -1,234 +1,260 @@
 program(1.0)
 [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.6.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 5, 345]> features, tensor<fp32, [5]> valid_mask) {
-            tensor<fp32, [256]> encoder_cnn_bias = const()[name = tensor<string, []>("encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-            tensor<fp32, [256, 256, 19]> encoder_cnn_weight = const()[name = tensor<string, []>("encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
-            tensor<fp32, [5, 5]> encoder__causal_mask = const()[name = tensor<string, []>("encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
-            tensor<fp32, [5]> encoder__t_index = const()[name = tensor<string, []>("encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
-            tensor<fp32, [256]> encoder_input_projection_linear_bias = const()[name = tensor<string, []>("encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
-            tensor<fp32, [256, 345]> encoder_input_projection_linear_weight = const()[name = tensor<string, []>("encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
-            tensor<fp32, [256]> encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
-            tensor<fp32, [1024]> encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
-            tensor<fp32, [256]> encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_bias = const()[name = tensor<string, []>("encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
-            tensor<fp32, [256]> encoder_ret_lns_0_weight = const()[name = tensor<string, []>("encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
-            tensor<fp32, [256]> encoder_q_proj_0_bias = const()[name = tensor<string, []>("encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_0_weight = const()[name = tensor<string, []>("encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
-            tensor<fp32, [256]> encoder_k_proj_0_bias = const()[name = tensor<string, []>("encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_0_weight = const()[name = tensor<string, []>("encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
-            tensor<fp32, [256]> encoder_v_proj_0_bias = const()[name = tensor<string, []>("encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_0_weight = const()[name = tensor<string, []>("encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
-            tensor<fp32, [256]> encoder_g_proj_0_bias = const()[name = tensor<string, []>("encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_0_weight = const()[name = tensor<string, []>("encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
-            tensor<fp32, [256]> encoder_out_proj_0_bias = const()[name = tensor<string, []>("encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_0_weight = const()[name = tensor<string, []>("encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
-            tensor<fp32, [512]> encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
-            tensor<fp32, [256]> encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
-            tensor<fp32, [1024]> encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
-            tensor<fp32, [256]> encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_bias = const()[name = tensor<string, []>("encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
-            tensor<fp32, [256]> encoder_layer_norm_0_weight = const()[name = tensor<string, []>("encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
-            tensor<fp32, [1024]> encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
-            tensor<fp32, [256]> encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_bias = const()[name = tensor<string, []>("encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
-            tensor<fp32, [256]> encoder_ret_lns_1_weight = const()[name = tensor<string, []>("encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
-            tensor<fp32, [256]> encoder_q_proj_1_bias = const()[name = tensor<string, []>("encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_1_weight = const()[name = tensor<string, []>("encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
-            tensor<fp32, [256]> encoder_k_proj_1_bias = const()[name = tensor<string, []>("encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_1_weight = const()[name = tensor<string, []>("encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
-            tensor<fp32, [256]> encoder_v_proj_1_bias = const()[name = tensor<string, []>("encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_1_weight = const()[name = tensor<string, []>("encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
-            tensor<fp32, [256]> encoder_g_proj_1_bias = const()[name = tensor<string, []>("encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_1_weight = const()[name = tensor<string, []>("encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
-            tensor<fp32, [256]> encoder_out_proj_1_bias = const()[name = tensor<string, []>("encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_1_weight = const()[name = tensor<string, []>("encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
-            tensor<fp32, [512]> encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
-            tensor<fp32, [256]> encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
-            tensor<fp32, [1024]> encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
-            tensor<fp32, [256]> encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_bias = const()[name = tensor<string, []>("encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
-            tensor<fp32, [256]> encoder_layer_norm_1_weight = const()[name = tensor<string, []>("encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
-            tensor<fp32, [1024]> encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
-            tensor<fp32, [256]> encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_bias = const()[name = tensor<string, []>("encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
-            tensor<fp32, [256]> encoder_ret_lns_2_weight = const()[name = tensor<string, []>("encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
-            tensor<fp32, [256]> encoder_q_proj_2_bias = const()[name = tensor<string, []>("encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_2_weight = const()[name = tensor<string, []>("encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
-            tensor<fp32, [256]> encoder_k_proj_2_bias = const()[name = tensor<string, []>("encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_2_weight = const()[name = tensor<string, []>("encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
-            tensor<fp32, [256]> encoder_v_proj_2_bias = const()[name = tensor<string, []>("encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_2_weight = const()[name = tensor<string, []>("encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
-            tensor<fp32, [256]> encoder_g_proj_2_bias = const()[name = tensor<string, []>("encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_2_weight = const()[name = tensor<string, []>("encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
-            tensor<fp32, [256]> encoder_out_proj_2_bias = const()[name = tensor<string, []>("encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_2_weight = const()[name = tensor<string, []>("encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
-            tensor<fp32, [512]> encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
-            tensor<fp32, [256]> encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
-            tensor<fp32, [1024]> encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
-            tensor<fp32, [256]> encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_bias = const()[name = tensor<string, []>("encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
-            tensor<fp32, [256]> encoder_layer_norm_2_weight = const()[name = tensor<string, []>("encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
-            tensor<fp32, [1024]> encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
-            tensor<fp32, [1024, 256]> encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
-            tensor<fp32, [256]> encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
-            tensor<fp32, [256, 1024]> encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_bias = const()[name = tensor<string, []>("encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
-            tensor<fp32, [256]> encoder_ret_lns_3_weight = const()[name = tensor<string, []>("encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
-            tensor<fp32, [256]> encoder_q_proj_3_bias = const()[name = tensor<string, []>("encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
-            tensor<fp32, [256, 256]> encoder_q_proj_3_weight = const()[name = tensor<string, []>("encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
-            tensor<fp32, [256]> encoder_k_proj_3_bias = const()[name = tensor<string, []>("encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
-            tensor<fp32, [256, 256]> encoder_k_proj_3_weight = const()[name = tensor<string, []>("encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
-            tensor<fp32, [256]> encoder_v_proj_3_bias = const()[name = tensor<string, []>("encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
-            tensor<fp32, [256, 256]> encoder_v_proj_3_weight = const()[name = tensor<string, []>("encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
-            tensor<fp32, [256]> encoder_g_proj_3_bias = const()[name = tensor<string, []>("encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
-            tensor<fp32, [256, 256]> encoder_g_proj_3_weight = const()[name = tensor<string, []>("encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
-            tensor<fp32, [256]> encoder_out_proj_3_bias = const()[name = tensor<string, []>("encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
-            tensor<fp32, [256, 256]> encoder_out_proj_3_weight = const()[name = tensor<string, []>("encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
-            tensor<fp32, [512]> encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
-            tensor<fp32, [512, 256, 1]> encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
-            tensor<fp32, [256, 1, 16]> encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
-            tensor<fp32, [256]> encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
-            tensor<fp32, [256, 256, 1]> encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
-            tensor<fp32, [1024]> encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
-            tensor<fp32, [1024, 256]> encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
-            tensor<fp32, [256]> encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
-            tensor<fp32, [256, 1024]> encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_bias = const()[name = tensor<string, []>("encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
-            tensor<fp32, [256]> encoder_layer_norm_3_weight = const()[name = tensor<string, []>("encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
-            tensor<fp32, [5, 5]> decoder__causal_mask = const()[name = tensor<string, []>("decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
-            tensor<fp32, [256]> decoder_convert_bias = const()[name = tensor<string, []>("decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
-            tensor<fp32, [256, 512]> decoder_convert_weight = const()[name = tensor<string, []>("decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
-            tensor<fp32, [256]> decoder_q_proj_0_bias = const()[name = tensor<string, []>("decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_0_weight = const()[name = tensor<string, []>("decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
-            tensor<fp32, [256]> decoder_k_proj_0_bias = const()[name = tensor<string, []>("decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_0_weight = const()[name = tensor<string, []>("decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
-            tensor<fp32, [256]> decoder_v_proj_0_bias = const()[name = tensor<string, []>("decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_0_weight = const()[name = tensor<string, []>("decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
-            tensor<fp32, [256]> decoder_g_proj_0_bias = const()[name = tensor<string, []>("decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_0_weight = const()[name = tensor<string, []>("decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
-            tensor<fp32, [256]> decoder_out_proj_0_bias = const()[name = tensor<string, []>("decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_0_weight = const()[name = tensor<string, []>("decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
-            tensor<fp32, [256]> decoder_norm11_0_bias = const()[name = tensor<string, []>("decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
-            tensor<fp32, [256]> decoder_norm11_0_weight = const()[name = tensor<string, []>("decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
-            tensor<fp32, [256]> decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
-            tensor<fp32, [768]> decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
-            tensor<fp32, [256]> decoder_norm21_0_bias = const()[name = tensor<string, []>("decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
-            tensor<fp32, [256]> decoder_norm21_0_weight = const()[name = tensor<string, []>("decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
-            tensor<fp32, [2048]> decoder_linear1_0_bias = const()[name = tensor<string, []>("decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_0_weight = const()[name = tensor<string, []>("decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
-            tensor<fp32, [256]> decoder_linear2_0_bias = const()[name = tensor<string, []>("decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_0_weight = const()[name = tensor<string, []>("decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
-            tensor<fp32, [256]> decoder_norm22_0_bias = const()[name = tensor<string, []>("decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
-            tensor<fp32, [256]> decoder_norm22_0_weight = const()[name = tensor<string, []>("decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
-            tensor<fp32, [256]> decoder_q_proj_1_bias = const()[name = tensor<string, []>("decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
-            tensor<fp32, [256, 256]> decoder_q_proj_1_weight = const()[name = tensor<string, []>("decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
-            tensor<fp32, [256]> decoder_k_proj_1_bias = const()[name = tensor<string, []>("decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
-            tensor<fp32, [256, 256]> decoder_k_proj_1_weight = const()[name = tensor<string, []>("decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
-            tensor<fp32, [256]> decoder_v_proj_1_bias = const()[name = tensor<string, []>("decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
-            tensor<fp32, [256, 256]> decoder_v_proj_1_weight = const()[name = tensor<string, []>("decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
-            tensor<fp32, [256]> decoder_g_proj_1_bias = const()[name = tensor<string, []>("decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
-            tensor<fp32, [256, 256]> decoder_g_proj_1_weight = const()[name = tensor<string, []>("decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
-            tensor<fp32, [256]> decoder_out_proj_1_bias = const()[name = tensor<string, []>("decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
-            tensor<fp32, [256, 256]> decoder_out_proj_1_weight = const()[name = tensor<string, []>("decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
-            tensor<fp32, [256]> decoder_norm11_1_bias = const()[name = tensor<string, []>("decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
-            tensor<fp32, [256]> decoder_norm11_1_weight = const()[name = tensor<string, []>("decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
-            tensor<fp32, [256]> decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
-            tensor<fp32, [256, 256]> decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
-            tensor<fp32, [768]> decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
-            tensor<fp32, [768, 256]> decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
-            tensor<fp32, [256]> decoder_norm21_1_bias = const()[name = tensor<string, []>("decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
-            tensor<fp32, [256]> decoder_norm21_1_weight = const()[name = tensor<string, []>("decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
-            tensor<fp32, [2048]> decoder_linear1_1_bias = const()[name = tensor<string, []>("decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
-            tensor<fp32, [2048, 256]> decoder_linear1_1_weight = const()[name = tensor<string, []>("decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
-            tensor<fp32, [256]> decoder_linear2_1_bias = const()[name = tensor<string, []>("decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
-            tensor<fp32, [256, 2048]> decoder_linear2_1_weight = const()[name = tensor<string, []>("decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
-            tensor<fp32, [256]> decoder_norm22_1_bias = const()[name = tensor<string, []>("decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
-            tensor<fp32, [256]> decoder_norm22_1_weight = const()[name = tensor<string, []>("decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
-            tensor<fp32, []> var_11 = const()[name = tensor<string, []>("op_11"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_18 = const()[name = tensor<string, []>("op_18"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<int32, []> var_21 = const()[name = tensor<string, []>("op_21"), val = tensor<int32, []>(2)];
-            tensor<int32, []> var_24 = const()[name = tensor<string, []>("op_24"), val = tensor<int32, []>(0)];
-            tensor<int32, []> var_27 = const()[name = tensor<string, []>("op_27"), val = tensor<int32, []>(1)];
-            tensor<fp32, []> var_29 = const()[name = tensor<string, []>("op_29"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<bool, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 5, 256]> input_1 = linear(bias = encoder_input_projection_linear_bias, weight = encoder_input_projection_linear_weight, x = features)[name = tensor<string, []>("linear_0")];
-            tensor<int32, [1]> input_3_axes_0 = const()[name = tensor<string, []>("input_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_3 = layer_norm(axes = input_3_axes_0, beta = encoder_pre_layer_norm_bias, epsilon = var_29, gamma = encoder_pre_layer_norm_weight, x = input_1)[name = tensor<string, []>("input_3")];
+    func main<ios17>(tensor<fp32, [1, 256, 18]> cnn_window, tensor<fp32, [2, 12, 4, 64, 64]> dec_kv, tensor<fp32, [2, 1]> dec_scale, tensor<fp32, [4, 1, 16, 256]> enc_conv_cache, tensor<fp32, [4, 1, 4, 64, 64]> enc_kv, tensor<fp32, [4, 1]> enc_scale, tensor<fp32, [1, 55, 23]> features, tensor<fp32, [5]> valid_mask) {
+            tensor<fp32, [256]> inner_encoder_cnn_bias = const()[name = tensor<string, []>("inner_encoder_cnn_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp32, [256, 256, 19]> inner_encoder_cnn_weight = const()[name = tensor<string, []>("inner_encoder_cnn_weight"), val = tensor<fp32, [256, 256, 19]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
+            tensor<fp32, [5, 5]> inner_encoder__causal_mask = const()[name = tensor<string, []>("inner_encoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4981952)))];
+            tensor<fp32, [5]> inner_encoder__t_index = const()[name = tensor<string, []>("inner_encoder__t_index"), val = tensor<fp32, [5]>([0x1p+0, 0x1p+1, 0x1.8p+1, 0x1p+2, 0x1.4p+2])];
+            tensor<fp32, [256]> inner_encoder_input_projection_linear_bias = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4982144)))];
+            tensor<fp32, [256, 345]> inner_encoder_input_projection_linear_weight = const()[name = tensor<string, []>("inner_encoder_input_projection_linear_weight"), val = tensor<fp32, [256, 345]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4983232)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_bias = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5336576)))];
+            tensor<fp32, [256]> inner_encoder_pre_layer_norm_weight = const()[name = tensor<string, []>("inner_encoder_pre_layer_norm_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5337664)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5338752)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5339840)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5340928)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5345088)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6393728)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6394816)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7443456)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_0_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7444544)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7445632)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7446720)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7708928)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7710016)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7972224)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7973312)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8235520)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8236608)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8498816)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8499904)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8762112)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8763200)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_0_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8764288)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_0_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8766400)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_0_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9290752)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9307200)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9308288)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9309376)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9310464)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_0_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9311552)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_0_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_0_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9312640)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9574848)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9575936)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_0_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9577024)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_0_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9581184)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_0_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10629824)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_0_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_0_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10630912)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11679552)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_0_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11680640)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11681728)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11682816)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11683904)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11688064)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12736704)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12737792)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13786432)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_1_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13787520)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13788608)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13789696)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14051904)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14052992)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14315200)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14316288)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14578496)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14579584)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14841792)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14842880)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15105088)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15106176)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_1_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15107264)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_1_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15109376)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_1_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15633728)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15650176)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15651264)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15652352)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15653440)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_1_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15654528)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_1_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_1_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15655616)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15917824)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15918912)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_1_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15920000)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_1_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15924160)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_1_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16972800)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_1_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_1_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16973888)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18022528)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_1_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18023616)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18024704)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18025792)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18026880)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18031040)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19079680)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19080768)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20129408)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_2_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20130496)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20131584)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20132672)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20394880)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20395968)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20658176)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20659264)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20921472)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20922560)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_2_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21184768)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_2_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_2_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21185856)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21448064)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21449152)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_2_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21450240)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_2_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21452352)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_2_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21976704)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21993152)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21994240)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21995328)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21996416)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_2_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21997504)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_2_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_2_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21998592)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22260800)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22261888)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_2_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22262976)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_2_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22267136)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_2_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23315776)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_2_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_2_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23316864)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24365504)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_2_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_2_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24366592)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24367680)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24368768)))];
+            tensor<fp32, [1024]> inner_encoder_ffn1_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24369856)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn1_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24374016)))];
+            tensor<fp32, [256]> inner_encoder_ffn1_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25422656)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn1_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn1_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25423744)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_bias = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26472384)))];
+            tensor<fp32, [256]> inner_encoder_ret_lns_3_weight = const()[name = tensor<string, []>("inner_encoder_ret_lns_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26473472)))];
+            tensor<fp32, [256]> inner_encoder_q_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_q_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26474560)))];
+            tensor<fp32, [256, 256]> inner_encoder_q_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_q_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26475648)))];
+            tensor<fp32, [256]> inner_encoder_k_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_k_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26737856)))];
+            tensor<fp32, [256, 256]> inner_encoder_k_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_k_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26738944)))];
+            tensor<fp32, [256]> inner_encoder_v_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_v_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27001152)))];
+            tensor<fp32, [256, 256]> inner_encoder_v_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_v_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27002240)))];
+            tensor<fp32, [256]> inner_encoder_g_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_g_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27264448)))];
+            tensor<fp32, [256, 256]> inner_encoder_g_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_g_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27265536)))];
+            tensor<fp32, [256]> inner_encoder_out_proj_3_bias = const()[name = tensor<string, []>("inner_encoder_out_proj_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27527744)))];
+            tensor<fp32, [256, 256]> inner_encoder_out_proj_3_weight = const()[name = tensor<string, []>("inner_encoder_out_proj_3_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27528832)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27791040)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27792128)))];
+            tensor<fp32, [512]> inner_encoder_conv_module_3_sequential_2_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27793216)))];
+            tensor<fp32, [512, 256, 1]> inner_encoder_conv_module_3_sequential_2_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_2_conv_weight"), val = tensor<fp32, [512, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27795328)))];
+            tensor<fp32, [256, 1, 16]> inner_encoder_conv_module_3_sequential_4_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_4_conv_weight"), val = tensor<fp32, [256, 1, 16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28319680)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_var = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_var"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28336128)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_running_mean = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_running_mean"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28337216)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28338304)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_5_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_5_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28339392)))];
+            tensor<fp32, [256]> inner_encoder_conv_module_3_sequential_7_conv_bias = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28340480)))];
+            tensor<fp32, [256, 256, 1]> inner_encoder_conv_module_3_sequential_7_conv_weight = const()[name = tensor<string, []>("inner_encoder_conv_module_3_sequential_7_conv_weight"), val = tensor<fp32, [256, 256, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28341568)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28603776)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_0_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28604864)))];
+            tensor<fp32, [1024]> inner_encoder_ffn2_3_module_sequential_1_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_bias"), val = tensor<fp32, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28605952)))];
+            tensor<fp32, [1024, 256]> inner_encoder_ffn2_3_module_sequential_1_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_1_linear_weight"), val = tensor<fp32, [1024, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28610112)))];
+            tensor<fp32, [256]> inner_encoder_ffn2_3_module_sequential_4_linear_bias = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29658752)))];
+            tensor<fp32, [256, 1024]> inner_encoder_ffn2_3_module_sequential_4_linear_weight = const()[name = tensor<string, []>("inner_encoder_ffn2_3_module_sequential_4_linear_weight"), val = tensor<fp32, [256, 1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29659840)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_bias = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30708480)))];
+            tensor<fp32, [256]> inner_encoder_layer_norm_3_weight = const()[name = tensor<string, []>("inner_encoder_layer_norm_3_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30709568)))];
+            tensor<fp32, [5, 5]> inner_decoder__causal_mask = const()[name = tensor<string, []>("inner_decoder__causal_mask"), val = tensor<fp32, [5, 5]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710656)))];
+            tensor<fp32, [256]> inner_decoder_convert_bias = const()[name = tensor<string, []>("inner_decoder_convert_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30710848)))];
+            tensor<fp32, [256, 512]> inner_decoder_convert_weight = const()[name = tensor<string, []>("inner_decoder_convert_weight"), val = tensor<fp32, [256, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30711936)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31236288)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31237376)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31499584)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31500672)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31762880)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31763968)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32026176)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32027264)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_0_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32289472)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_0_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_0_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32290560)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_bias = const()[name = tensor<string, []>("inner_decoder_norm11_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32552768)))];
+            tensor<fp32, [256]> inner_decoder_norm11_0_weight = const()[name = tensor<string, []>("inner_decoder_norm11_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32553856)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_0_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32554944)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_0_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32556032)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_0_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32818240)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_0_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_0_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(32821376)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_bias = const()[name = tensor<string, []>("inner_decoder_norm21_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33607872)))];
+            tensor<fp32, [256]> inner_decoder_norm21_0_weight = const()[name = tensor<string, []>("inner_decoder_norm21_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33608960)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_0_bias = const()[name = tensor<string, []>("inner_decoder_linear1_0_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33610048)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_0_weight = const()[name = tensor<string, []>("inner_decoder_linear1_0_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33618304)))];
+            tensor<fp32, [256]> inner_decoder_linear2_0_bias = const()[name = tensor<string, []>("inner_decoder_linear2_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35715520)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_0_weight = const()[name = tensor<string, []>("inner_decoder_linear2_0_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35716608)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_bias = const()[name = tensor<string, []>("inner_decoder_norm22_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37813824)))];
+            tensor<fp32, [256]> inner_decoder_norm22_0_weight = const()[name = tensor<string, []>("inner_decoder_norm22_0_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37814912)))];
+            tensor<fp32, [256]> inner_decoder_q_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_q_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37816000)))];
+            tensor<fp32, [256, 256]> inner_decoder_q_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_q_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37817088)))];
+            tensor<fp32, [256]> inner_decoder_k_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_k_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38079296)))];
+            tensor<fp32, [256, 256]> inner_decoder_k_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_k_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38080384)))];
+            tensor<fp32, [256]> inner_decoder_v_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_v_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38342592)))];
+            tensor<fp32, [256, 256]> inner_decoder_v_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_v_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38343680)))];
+            tensor<fp32, [256]> inner_decoder_g_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_g_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38605888)))];
+            tensor<fp32, [256, 256]> inner_decoder_g_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_g_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38606976)))];
+            tensor<fp32, [256]> inner_decoder_out_proj_1_bias = const()[name = tensor<string, []>("inner_decoder_out_proj_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38869184)))];
+            tensor<fp32, [256, 256]> inner_decoder_out_proj_1_weight = const()[name = tensor<string, []>("inner_decoder_out_proj_1_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38870272)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_bias = const()[name = tensor<string, []>("inner_decoder_norm11_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39132480)))];
+            tensor<fp32, [256]> inner_decoder_norm11_1_weight = const()[name = tensor<string, []>("inner_decoder_norm11_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39133568)))];
+            tensor<fp32, [256]> inner_decoder_self_attn2_1_out_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39134656)))];
+            tensor<fp32, [256, 256]> inner_decoder_self_attn2_1_out_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_out_proj_weight"), val = tensor<fp32, [256, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39135744)))];
+            tensor<fp32, [768]> inner_decoder_self_attn2_1_in_proj_bias = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_bias"), val = tensor<fp32, [768]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39397952)))];
+            tensor<fp32, [768, 256]> inner_decoder_self_attn2_1_in_proj_weight = const()[name = tensor<string, []>("inner_decoder_self_attn2_1_in_proj_weight"), val = tensor<fp32, [768, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39401088)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_bias = const()[name = tensor<string, []>("inner_decoder_norm21_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40187584)))];
+            tensor<fp32, [256]> inner_decoder_norm21_1_weight = const()[name = tensor<string, []>("inner_decoder_norm21_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40188672)))];
+            tensor<fp32, [2048]> inner_decoder_linear1_1_bias = const()[name = tensor<string, []>("inner_decoder_linear1_1_bias"), val = tensor<fp32, [2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40189760)))];
+            tensor<fp32, [2048, 256]> inner_decoder_linear1_1_weight = const()[name = tensor<string, []>("inner_decoder_linear1_1_weight"), val = tensor<fp32, [2048, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40198016)))];
+            tensor<fp32, [256]> inner_decoder_linear2_1_bias = const()[name = tensor<string, []>("inner_decoder_linear2_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42295232)))];
+            tensor<fp32, [256, 2048]> inner_decoder_linear2_1_weight = const()[name = tensor<string, []>("inner_decoder_linear2_1_weight"), val = tensor<fp32, [256, 2048]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42296320)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_bias = const()[name = tensor<string, []>("inner_decoder_norm22_1_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44393536)))];
+            tensor<fp32, [256]> inner_decoder_norm22_1_weight = const()[name = tensor<string, []>("inner_decoder_norm22_1_weight"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44394624)))];
+            tensor<int32, [3]> var_19_begin_0 = const()[name = tensor<string, []>("op_19_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_19_end_0 = const()[name = tensor<string, []>("op_19_end_0"), val = tensor<int32, [3]>([1, 15, 23])];
+            tensor<bool, [3]> var_19_end_mask_0 = const()[name = tensor<string, []>("op_19_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_19 = slice_by_index(begin = var_19_begin_0, end = var_19_end_0, end_mask = var_19_end_mask_0, x = features)[name = tensor<string, []>("op_19")];
+            tensor<int32, [3]> var_29_begin_0 = const()[name = tensor<string, []>("op_29_begin_0"), val = tensor<int32, [3]>([0, 10, 0])];
+            tensor<int32, [3]> var_29_end_0 = const()[name = tensor<string, []>("op_29_end_0"), val = tensor<int32, [3]>([1, 25, 23])];
+            tensor<bool, [3]> var_29_end_mask_0 = const()[name = tensor<string, []>("op_29_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_29 = slice_by_index(begin = var_29_begin_0, end = var_29_end_0, end_mask = var_29_end_mask_0, x = features)[name = tensor<string, []>("op_29")];
+            tensor<int32, [3]> var_39_begin_0 = const()[name = tensor<string, []>("op_39_begin_0"), val = tensor<int32, [3]>([0, 20, 0])];
+            tensor<int32, [3]> var_39_end_0 = const()[name = tensor<string, []>("op_39_end_0"), val = tensor<int32, [3]>([1, 35, 23])];
+            tensor<bool, [3]> var_39_end_mask_0 = const()[name = tensor<string, []>("op_39_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_39 = slice_by_index(begin = var_39_begin_0, end = var_39_end_0, end_mask = var_39_end_mask_0, x = features)[name = tensor<string, []>("op_39")];
+            tensor<int32, [3]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [3]>([0, 30, 0])];
+            tensor<int32, [3]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [3]>([1, 45, 23])];
+            tensor<bool, [3]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 15, 23]> var_49 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, x = features)[name = tensor<string, []>("op_49")];
+            tensor<int32, [3]> var_59_begin_0 = const()[name = tensor<string, []>("op_59_begin_0"), val = tensor<int32, [3]>([0, 40, 0])];
+            tensor<int32, [3]> var_59_end_0 = const()[name = tensor<string, []>("op_59_end_0"), val = tensor<int32, [3]>([1, 1, 23])];
+            tensor<bool, [3]> var_59_end_mask_0 = const()[name = tensor<string, []>("op_59_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 23]> var_59 = slice_by_index(begin = var_59_begin_0, end = var_59_end_0, end_mask = var_59_end_mask_0, x = features)[name = tensor<string, []>("op_59")];
+            tensor<int32, []> stacked_axis_0 = const()[name = tensor<string, []>("stacked_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 15, 23]> stacked = stack(axis = stacked_axis_0, values = (var_19, var_29, var_39, var_49, var_59))[name = tensor<string, []>("stacked")];
+            tensor<int32, [3]> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, [3]>([1, 5, 345])];
+            tensor<fp32, [1, 5, 345]> input_1 = reshape(shape = var_66, x = stacked)[name = tensor<string, []>("input_1")];
+            tensor<fp32, []> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<fp32, []>(0x1p+0)];
+            tensor<bool, []> var_75 = const()[name = tensor<string, []>("op_75"), val = tensor<bool, []>(true)];
+            tensor<fp32, []> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
+            tensor<int32, []> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<int32, []>(0)];
+            tensor<int32, []> var_81 = const()[name = tensor<string, []>("op_81"), val = tensor<int32, []>(2)];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(-1)];
+            tensor<fp32, []> var_84 = const()[name = tensor<string, []>("op_84"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
+            tensor<fp32, []> var_90 = const()[name = tensor<string, []>("op_90"), val = tensor<fp32, []>(0x1.5798eep-27)];
+            tensor<int32, []> var_93 = const()[name = tensor<string, []>("op_93"), val = tensor<int32, []>(1)];
+            tensor<fp32, [1, 5, 256]> input_3 = linear(bias = inner_encoder_input_projection_linear_bias, weight = inner_encoder_input_projection_linear_weight, x = input_1)[name = tensor<string, []>("linear_0")];
             tensor<int32, [1]> input_5_axes_0 = const()[name = tensor<string, []>("input_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = encoder_ffn1_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_0_module_sequential_0_weight, x = input_3)[name = tensor<string, []>("input_5")];
-            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = encoder_ffn1_0_module_sequential_1_linear_bias, weight = encoder_ffn1_0_module_sequential_1_linear_weight, x = input_5)[name = tensor<string, []>("linear_1")];
-            tensor<fp32, [1, 5, 1024]> input_7 = silu(x = inputs_1)[name = tensor<string, []>("input_7")];
-            tensor<fp32, [1, 5, 256]> input_11 = linear(bias = encoder_ffn1_0_module_sequential_4_linear_bias, weight = encoder_ffn1_0_module_sequential_4_linear_weight, x = input_7)[name = tensor<string, []>("linear_2")];
-            tensor<fp32, []> var_148 = const()[name = tensor<string, []>("op_148"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_149 = mul(x = input_11, y = var_148)[name = tensor<string, []>("op_149")];
-            tensor<fp32, [1, 5, 256]> input_13 = add(x = var_149, y = input_3)[name = tensor<string, []>("input_13")];
+            tensor<fp32, [1, 5, 256]> input_5 = layer_norm(axes = input_5_axes_0, beta = inner_encoder_pre_layer_norm_bias, epsilon = var_76, gamma = inner_encoder_pre_layer_norm_weight, x = input_3)[name = tensor<string, []>("input_5")];
+            tensor<int32, [1]> input_7_axes_0 = const()[name = tensor<string, []>("input_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_7 = layer_norm(axes = input_7_axes_0, beta = inner_encoder_ffn1_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_0_module_sequential_0_weight, x = input_5)[name = tensor<string, []>("input_7")];
+            tensor<fp32, [1, 5, 1024]> inputs_1 = linear(bias = inner_encoder_ffn1_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_1_linear_weight, x = input_7)[name = tensor<string, []>("linear_1")];
+            tensor<fp32, [1, 5, 1024]> input_9 = silu(x = inputs_1)[name = tensor<string, []>("input_9")];
+            tensor<fp32, [1, 5, 256]> input_13 = linear(bias = inner_encoder_ffn1_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_0_module_sequential_4_linear_weight, x = input_9)[name = tensor<string, []>("linear_2")];
+            tensor<fp32, []> var_214 = const()[name = tensor<string, []>("op_214"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_215 = mul(x = input_13, y = var_214)[name = tensor<string, []>("op_215")];
+            tensor<fp32, [1, 5, 256]> input_15 = add(x = var_215, y = input_5)[name = tensor<string, []>("input_15")];
             tensor<int32, [1]> x_1_axes_0 = const()[name = tensor<string, []>("x_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = encoder_ret_lns_0_bias, epsilon = var_29, gamma = encoder_ret_lns_0_weight, x = input_13)[name = tensor<string, []>("x_1")];
+            tensor<fp32, [1, 5, 256]> x_1 = layer_norm(axes = x_1_axes_0, beta = inner_encoder_ret_lns_0_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_0_weight, x = input_15)[name = tensor<string, []>("x_1")];
             tensor<int32, [5]> prev_kv_1_begin_0 = const()[name = tensor<string, []>("prev_kv_1_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_1_end_0 = const()[name = tensor<string, []>("prev_kv_1_end_0"), val = tensor<int32, [5]>([1, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_1_end_mask_0 = const()[name = tensor<string, []>("prev_kv_1_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -239,183 +265,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_1_end_mask_0 = const()[name = tensor<string, []>("prev_scale_1_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_1_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_1_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_1 = slice_by_index(begin = prev_scale_1_begin_0, end = prev_scale_1_end_0, end_mask = prev_scale_1_end_mask_0, squeeze_mask = prev_scale_1_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_1")];
-            tensor<fp32, [1, 5, 256]> var_163 = linear(bias = encoder_q_proj_0_bias, weight = encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
-            tensor<int32, [4]> var_164 = const()[name = tensor<string, []>("op_164"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_165 = reshape(shape = var_164, x = var_163)[name = tensor<string, []>("op_165")];
+            tensor<fp32, [1, 5, 256]> var_229 = linear(bias = inner_encoder_q_proj_0_bias, weight = inner_encoder_q_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_3")];
+            tensor<int32, [4]> var_230 = const()[name = tensor<string, []>("op_230"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_231 = reshape(shape = var_230, x = var_229)[name = tensor<string, []>("op_231")];
             tensor<int32, [4]> q_1_perm_0 = const()[name = tensor<string, []>("q_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_169 = linear(bias = encoder_k_proj_0_bias, weight = encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
-            tensor<fp32, []> var_170 = const()[name = tensor<string, []>("op_170"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_171 = mul(x = var_169, y = var_170)[name = tensor<string, []>("op_171")];
-            tensor<int32, [4]> var_172 = const()[name = tensor<string, []>("op_172"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_173 = reshape(shape = var_172, x = var_171)[name = tensor<string, []>("op_173")];
+            tensor<fp32, [1, 5, 256]> var_235 = linear(bias = inner_encoder_k_proj_0_bias, weight = inner_encoder_k_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_4")];
+            tensor<fp32, []> var_236 = const()[name = tensor<string, []>("op_236"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_237 = mul(x = var_235, y = var_236)[name = tensor<string, []>("op_237")];
+            tensor<int32, [4]> var_238 = const()[name = tensor<string, []>("op_238"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_239 = reshape(shape = var_238, x = var_237)[name = tensor<string, []>("op_239")];
             tensor<int32, [4]> k_1_perm_0 = const()[name = tensor<string, []>("k_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_177 = linear(bias = encoder_v_proj_0_bias, weight = encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
-            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_179 = reshape(shape = var_178, x = var_177)[name = tensor<string, []>("op_179")];
+            tensor<fp32, [1, 5, 256]> var_243 = linear(bias = inner_encoder_v_proj_0_bias, weight = inner_encoder_v_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_5")];
+            tensor<int32, [4]> var_244 = const()[name = tensor<string, []>("op_244"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_245 = reshape(shape = var_244, x = var_243)[name = tensor<string, []>("op_245")];
             tensor<int32, [4]> v_1_perm_0 = const()[name = tensor<string, []>("v_1_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_17 = linear(bias = encoder_g_proj_0_bias, weight = encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
+            tensor<fp32, [1, 5, 256]> input_19 = linear(bias = inner_encoder_g_proj_0_bias, weight = inner_encoder_g_proj_0_weight, x = x_1)[name = tensor<string, []>("linear_6")];
             tensor<fp32, [1]> sqrt_s0_1 = sqrt(x = prev_scale_1)[name = tensor<string, []>("sqrt_s0_1")];
-            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = encoder__t_index)[name = tensor<string, []>("s_t_1")];
+            tensor<fp32, [5]> s_t_1 = add(x = prev_scale_1, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_1")];
             tensor<fp32, [5]> sqrt_s_t_1 = sqrt(x = s_t_1)[name = tensor<string, []>("sqrt_s_t_1")];
             tensor<bool, []> qk_1_transpose_x_1 = const()[name = tensor<string, []>("qk_1_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_1_transpose_y_1 = const()[name = tensor<string, []>("qk_1_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_173)[name = tensor<string, []>("transpose_57")];
-            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_165)[name = tensor<string, []>("transpose_58")];
+            tensor<fp32, [1, 4, 5, 64]> k_1 = transpose(perm = k_1_perm_0, x = var_239)[name = tensor<string, []>("transpose_57")];
+            tensor<fp32, [1, 4, 5, 64]> q_1 = transpose(perm = q_1_perm_0, x = var_231)[name = tensor<string, []>("transpose_58")];
             tensor<fp32, [1, 4, 5, 5]> qk_1 = matmul(transpose_x = qk_1_transpose_x_1, transpose_y = qk_1_transpose_y_1, x = q_1, y = k_1)[name = tensor<string, []>("qk_1")];
-            tensor<int32, [2]> var_189 = const()[name = tensor<string, []>("op_189"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_190 = reshape(shape = var_189, x = sqrt_s_t_1)[name = tensor<string, []>("op_190")];
-            tensor<fp32, [5, 5]> M_1 = real_div(x = encoder__causal_mask, y = var_190)[name = tensor<string, []>("M_1")];
-            tensor<fp32, [1, 4, 5, 5]> var_192 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_192")];
+            tensor<int32, [2]> var_255 = const()[name = tensor<string, []>("op_255"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_256 = reshape(shape = var_255, x = sqrt_s_t_1)[name = tensor<string, []>("op_256")];
+            tensor<fp32, [5, 5]> M_1 = real_div(x = inner_encoder__causal_mask, y = var_256)[name = tensor<string, []>("M_1")];
+            tensor<fp32, [1, 4, 5, 5]> var_258 = mul(x = qk_1, y = M_1)[name = tensor<string, []>("op_258")];
             tensor<bool, []> inner_1_transpose_x_0 = const()[name = tensor<string, []>("inner_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_1_transpose_y_0 = const()[name = tensor<string, []>("inner_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_179)[name = tensor<string, []>("transpose_56")];
-            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_192, y = v_1)[name = tensor<string, []>("inner_1")];
-            tensor<bool, []> var_194_transpose_x_0 = const()[name = tensor<string, []>("op_194_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_194_transpose_y_0 = const()[name = tensor<string, []>("op_194_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_194 = matmul(transpose_x = var_194_transpose_x_0, transpose_y = var_194_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_194")];
-            tensor<fp32, [5]> var_195 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_195")];
-            tensor<int32, [4]> var_196 = const()[name = tensor<string, []>("op_196"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_197 = reshape(shape = var_196, x = var_195)[name = tensor<string, []>("op_197")];
-            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_194, y = var_197)[name = tensor<string, []>("cross_1")];
+            tensor<fp32, [1, 4, 5, 64]> v_1 = transpose(perm = v_1_perm_0, x = var_245)[name = tensor<string, []>("transpose_56")];
+            tensor<fp32, [1, 4, 5, 64]> inner_1 = matmul(transpose_x = inner_1_transpose_x_0, transpose_y = inner_1_transpose_y_0, x = var_258, y = v_1)[name = tensor<string, []>("inner_1")];
+            tensor<bool, []> var_260_transpose_x_0 = const()[name = tensor<string, []>("op_260_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_260_transpose_y_0 = const()[name = tensor<string, []>("op_260_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_260 = matmul(transpose_x = var_260_transpose_x_0, transpose_y = var_260_transpose_y_0, x = q_1, y = prev_kv_1)[name = tensor<string, []>("op_260")];
+            tensor<fp32, [5]> var_261 = real_div(x = sqrt_s0_1, y = sqrt_s_t_1)[name = tensor<string, []>("op_261")];
+            tensor<int32, [4]> var_262 = const()[name = tensor<string, []>("op_262"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_263 = reshape(shape = var_262, x = var_261)[name = tensor<string, []>("op_263")];
+            tensor<fp32, [1, 4, 5, 64]> cross_1 = mul(x = var_260, y = var_263)[name = tensor<string, []>("cross_1")];
             tensor<fp32, [1, 4, 5, 64]> out_1 = add(x = inner_1, y = cross_1)[name = tensor<string, []>("out_1")];
-            tensor<fp32, [1, 4, 64, 64]> var_200 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_200")];
-            tensor<bool, []> var_202_transpose_x_1 = const()[name = tensor<string, []>("op_202_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_202_transpose_y_1 = const()[name = tensor<string, []>("op_202_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_202 = matmul(transpose_x = var_202_transpose_x_1, transpose_y = var_202_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_202")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_200, y = var_202)[name = tensor<string, []>("new_kv_unnorm_1")];
-            tensor<fp32, []> var_204 = const()[name = tensor<string, []>("op_204"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_204)[name = tensor<string, []>("new_scale_1")];
-            tensor<fp32, [1]> var_206 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_206")];
-            tensor<fp32, [1, 4, 64, 64]> var_207 = real_div(x = new_kv_unnorm_1, y = var_206)[name = tensor<string, []>("op_207")];
-            tensor<int32, [4]> var_208_perm_0 = const()[name = tensor<string, []>("op_208_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_266 = mul(x = prev_kv_1, y = sqrt_s0_1)[name = tensor<string, []>("op_266")];
+            tensor<bool, []> var_268_transpose_x_1 = const()[name = tensor<string, []>("op_268_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_268_transpose_y_1 = const()[name = tensor<string, []>("op_268_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_268 = matmul(transpose_x = var_268_transpose_x_1, transpose_y = var_268_transpose_y_1, x = k_1, y = v_1)[name = tensor<string, []>("op_268")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_1 = add(x = var_266, y = var_268)[name = tensor<string, []>("new_kv_unnorm_1")];
+            tensor<fp32, []> var_270 = const()[name = tensor<string, []>("op_270"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_1 = add(x = prev_scale_1, y = var_270)[name = tensor<string, []>("new_scale_1")];
+            tensor<fp32, [1]> var_272 = sqrt(x = new_scale_1)[name = tensor<string, []>("op_272")];
+            tensor<fp32, [1, 4, 64, 64]> var_273 = real_div(x = new_kv_unnorm_1, y = var_272)[name = tensor<string, []>("op_273")];
+            tensor<int32, [4]> var_274_perm_0 = const()[name = tensor<string, []>("op_274_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_208 = transpose(perm = var_208_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
-            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_18, x = var_208)[name = tensor<string, []>("out_3")];
-            tensor<int32, [3]> var_212 = const()[name = tensor<string, []>("op_212"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_212, x = out_3)[name = tensor<string, []>("out_5")];
-            tensor<fp32, [1, 5, 256]> var_214 = silu(x = input_17)[name = tensor<string, []>("op_214")];
-            tensor<fp32, [1, 5, 256]> input_19 = mul(x = var_214, y = out_5)[name = tensor<string, []>("input_19")];
-            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = encoder_out_proj_0_bias, weight = encoder_out_proj_0_weight, x = input_19)[name = tensor<string, []>("linear_7")];
-            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_13, y = ret_out_1)[name = tensor<string, []>("x_3")];
+            tensor<fp32, [1, 5, 4, 64]> var_274 = transpose(perm = var_274_perm_0, x = out_1)[name = tensor<string, []>("transpose_55")];
+            tensor<fp32, [1, 5, 4, 64]> out_3 = layer_norm(axes = out_3_axes_0, epsilon = var_84, x = var_274)[name = tensor<string, []>("out_3")];
+            tensor<int32, [3]> var_278 = const()[name = tensor<string, []>("op_278"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_5 = reshape(shape = var_278, x = out_3)[name = tensor<string, []>("out_5")];
+            tensor<fp32, [1, 5, 256]> var_280 = silu(x = input_19)[name = tensor<string, []>("op_280")];
+            tensor<fp32, [1, 5, 256]> input_21 = mul(x = var_280, y = out_5)[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 5, 256]> ret_out_1 = linear(bias = inner_encoder_out_proj_0_bias, weight = inner_encoder_out_proj_0_weight, x = input_21)[name = tensor<string, []>("linear_7")];
+            tensor<fp32, [1, 5, 256]> x_3 = add(x = input_15, y = ret_out_1)[name = tensor<string, []>("x_3")];
             tensor<int32, [4]> window_1_begin_0 = const()[name = tensor<string, []>("window_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> window_1_end_0 = const()[name = tensor<string, []>("window_1_end_0"), val = tensor<int32, [4]>([1, 1, 16, 256])];
             tensor<bool, [4]> window_1_end_mask_0 = const()[name = tensor<string, []>("window_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_1_squeeze_mask_0 = const()[name = tensor<string, []>("window_1_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_1 = slice_by_index(begin = window_1_begin_0, end = window_1_end_0, end_mask = window_1_end_mask_0, squeeze_mask = window_1_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_1")];
-            tensor<int32, [3]> var_222_begin_0 = const()[name = tensor<string, []>("op_222_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_222_end_0 = const()[name = tensor<string, []>("op_222_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_222_end_mask_0 = const()[name = tensor<string, []>("op_222_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_222 = slice_by_index(begin = var_222_begin_0, end = var_222_end_0, end_mask = var_222_end_mask_0, x = x_3)[name = tensor<string, []>("op_222")];
-            tensor<int32, [3]> var_225_begin_0 = const()[name = tensor<string, []>("op_225_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_225_end_0 = const()[name = tensor<string, []>("op_225_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_225_end_mask_0 = const()[name = tensor<string, []>("op_225_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_225 = slice_by_index(begin = var_225_begin_0, end = var_225_end_0, end_mask = var_225_end_mask_0, x = window_1)[name = tensor<string, []>("op_225")];
+            tensor<int32, [3]> var_288_begin_0 = const()[name = tensor<string, []>("op_288_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_288_end_0 = const()[name = tensor<string, []>("op_288_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_288_end_mask_0 = const()[name = tensor<string, []>("op_288_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_288 = slice_by_index(begin = var_288_begin_0, end = var_288_end_0, end_mask = var_288_end_mask_0, x = x_3)[name = tensor<string, []>("op_288")];
+            tensor<int32, [3]> var_291_begin_0 = const()[name = tensor<string, []>("op_291_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_291_end_0 = const()[name = tensor<string, []>("op_291_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_291_end_mask_0 = const()[name = tensor<string, []>("op_291_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_291 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = window_1)[name = tensor<string, []>("op_291")];
             tensor<bool, []> window_3_interleave_0 = const()[name = tensor<string, []>("window_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_27, interleave = window_3_interleave_0, values = (var_225, var_222))[name = tensor<string, []>("window_3")];
-            tensor<int32, [3]> var_230_begin_0 = const()[name = tensor<string, []>("op_230_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_230_end_0 = const()[name = tensor<string, []>("op_230_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_230_end_mask_0 = const()[name = tensor<string, []>("op_230_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_230 = slice_by_index(begin = var_230_begin_0, end = var_230_end_0, end_mask = var_230_end_mask_0, x = x_3)[name = tensor<string, []>("op_230")];
-            tensor<int32, [3]> var_233_begin_0 = const()[name = tensor<string, []>("op_233_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_233_end_0 = const()[name = tensor<string, []>("op_233_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_233_end_mask_0 = const()[name = tensor<string, []>("op_233_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_233 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = window_3)[name = tensor<string, []>("op_233")];
+            tensor<fp32, [1, 16, 256]> window_3 = concat(axis = var_93, interleave = window_3_interleave_0, values = (var_291, var_288))[name = tensor<string, []>("window_3")];
+            tensor<int32, [3]> var_296_begin_0 = const()[name = tensor<string, []>("op_296_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_296_end_0 = const()[name = tensor<string, []>("op_296_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_296_end_mask_0 = const()[name = tensor<string, []>("op_296_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_296 = slice_by_index(begin = var_296_begin_0, end = var_296_end_0, end_mask = var_296_end_mask_0, x = x_3)[name = tensor<string, []>("op_296")];
+            tensor<int32, [3]> var_299_begin_0 = const()[name = tensor<string, []>("op_299_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_299_end_0 = const()[name = tensor<string, []>("op_299_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_299_end_mask_0 = const()[name = tensor<string, []>("op_299_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_299 = slice_by_index(begin = var_299_begin_0, end = var_299_end_0, end_mask = var_299_end_mask_0, x = window_3)[name = tensor<string, []>("op_299")];
             tensor<bool, []> window_5_interleave_0 = const()[name = tensor<string, []>("window_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_27, interleave = window_5_interleave_0, values = (var_233, var_230))[name = tensor<string, []>("window_5")];
-            tensor<int32, [3]> var_238_begin_0 = const()[name = tensor<string, []>("op_238_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_238_end_0 = const()[name = tensor<string, []>("op_238_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_238_end_mask_0 = const()[name = tensor<string, []>("op_238_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_238 = slice_by_index(begin = var_238_begin_0, end = var_238_end_0, end_mask = var_238_end_mask_0, x = x_3)[name = tensor<string, []>("op_238")];
-            tensor<int32, [3]> var_241_begin_0 = const()[name = tensor<string, []>("op_241_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_241_end_0 = const()[name = tensor<string, []>("op_241_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_241_end_mask_0 = const()[name = tensor<string, []>("op_241_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_241 = slice_by_index(begin = var_241_begin_0, end = var_241_end_0, end_mask = var_241_end_mask_0, x = window_5)[name = tensor<string, []>("op_241")];
+            tensor<fp32, [1, 16, 256]> window_5 = concat(axis = var_93, interleave = window_5_interleave_0, values = (var_299, var_296))[name = tensor<string, []>("window_5")];
+            tensor<int32, [3]> var_304_begin_0 = const()[name = tensor<string, []>("op_304_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_304_end_0 = const()[name = tensor<string, []>("op_304_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_304_end_mask_0 = const()[name = tensor<string, []>("op_304_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_304 = slice_by_index(begin = var_304_begin_0, end = var_304_end_0, end_mask = var_304_end_mask_0, x = x_3)[name = tensor<string, []>("op_304")];
+            tensor<int32, [3]> var_307_begin_0 = const()[name = tensor<string, []>("op_307_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_307_end_0 = const()[name = tensor<string, []>("op_307_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_307_end_mask_0 = const()[name = tensor<string, []>("op_307_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_307 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = window_5)[name = tensor<string, []>("op_307")];
             tensor<bool, []> window_7_interleave_0 = const()[name = tensor<string, []>("window_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_27, interleave = window_7_interleave_0, values = (var_241, var_238))[name = tensor<string, []>("window_7")];
-            tensor<int32, [3]> var_246_begin_0 = const()[name = tensor<string, []>("op_246_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_246_end_0 = const()[name = tensor<string, []>("op_246_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_246_end_mask_0 = const()[name = tensor<string, []>("op_246_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_246 = slice_by_index(begin = var_246_begin_0, end = var_246_end_0, end_mask = var_246_end_mask_0, x = x_3)[name = tensor<string, []>("op_246")];
-            tensor<int32, [3]> var_249_begin_0 = const()[name = tensor<string, []>("op_249_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_249_end_0 = const()[name = tensor<string, []>("op_249_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_249_end_mask_0 = const()[name = tensor<string, []>("op_249_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_249 = slice_by_index(begin = var_249_begin_0, end = var_249_end_0, end_mask = var_249_end_mask_0, x = window_7)[name = tensor<string, []>("op_249")];
+            tensor<fp32, [1, 16, 256]> window_7 = concat(axis = var_93, interleave = window_7_interleave_0, values = (var_307, var_304))[name = tensor<string, []>("window_7")];
+            tensor<int32, [3]> var_312_begin_0 = const()[name = tensor<string, []>("op_312_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_312_end_0 = const()[name = tensor<string, []>("op_312_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_312_end_mask_0 = const()[name = tensor<string, []>("op_312_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_312 = slice_by_index(begin = var_312_begin_0, end = var_312_end_0, end_mask = var_312_end_mask_0, x = x_3)[name = tensor<string, []>("op_312")];
+            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = window_7)[name = tensor<string, []>("op_315")];
             tensor<bool, []> window_9_interleave_0 = const()[name = tensor<string, []>("window_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_27, interleave = window_9_interleave_0, values = (var_249, var_246))[name = tensor<string, []>("window_9")];
-            tensor<int32, [3]> var_254_begin_0 = const()[name = tensor<string, []>("op_254_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_254_end_0 = const()[name = tensor<string, []>("op_254_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_254_end_mask_0 = const()[name = tensor<string, []>("op_254_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_254 = slice_by_index(begin = var_254_begin_0, end = var_254_end_0, end_mask = var_254_end_mask_0, x = x_3)[name = tensor<string, []>("op_254")];
-            tensor<int32, [3]> var_257_begin_0 = const()[name = tensor<string, []>("op_257_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_257_end_0 = const()[name = tensor<string, []>("op_257_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_257_end_mask_0 = const()[name = tensor<string, []>("op_257_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_257 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = window_9)[name = tensor<string, []>("op_257")];
+            tensor<fp32, [1, 16, 256]> window_9 = concat(axis = var_93, interleave = window_9_interleave_0, values = (var_315, var_312))[name = tensor<string, []>("window_9")];
+            tensor<int32, [3]> var_320_begin_0 = const()[name = tensor<string, []>("op_320_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_320_end_0 = const()[name = tensor<string, []>("op_320_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_320_end_mask_0 = const()[name = tensor<string, []>("op_320_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_320 = slice_by_index(begin = var_320_begin_0, end = var_320_end_0, end_mask = var_320_end_mask_0, x = x_3)[name = tensor<string, []>("op_320")];
+            tensor<int32, [3]> var_323_begin_0 = const()[name = tensor<string, []>("op_323_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_323_end_0 = const()[name = tensor<string, []>("op_323_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_323_end_mask_0 = const()[name = tensor<string, []>("op_323_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_323 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = window_9)[name = tensor<string, []>("op_323")];
             tensor<bool, []> window_11_interleave_0 = const()[name = tensor<string, []>("window_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_27, interleave = window_11_interleave_0, values = (var_257, var_254))[name = tensor<string, []>("window_11")];
-            tensor<bool, []> input_21_interleave_0 = const()[name = tensor<string, []>("input_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_21 = concat(axis = var_24, interleave = input_21_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_21")];
+            tensor<fp32, [1, 16, 256]> window_11 = concat(axis = var_93, interleave = window_11_interleave_0, values = (var_323, var_320))[name = tensor<string, []>("window_11")];
+            tensor<bool, []> input_23_interleave_0 = const()[name = tensor<string, []>("input_23_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_23 = concat(axis = var_79, interleave = input_23_interleave_0, values = (window_3, window_5, window_7, window_9, window_11))[name = tensor<string, []>("input_23")];
             tensor<int32, [1]> x_5_axes_0 = const()[name = tensor<string, []>("x_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = encoder_conv_module_0_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_0_weight, x = input_21)[name = tensor<string, []>("x_5")];
-            tensor<int32, [3]> input_23_perm_0 = const()[name = tensor<string, []>("input_23_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_5 = layer_norm(axes = x_5_axes_0, beta = inner_encoder_conv_module_0_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_0_weight, x = input_23)[name = tensor<string, []>("x_5")];
+            tensor<int32, [3]> input_25_perm_0 = const()[name = tensor<string, []>("input_25_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_3_pad_type_0 = const()[name = tensor<string, []>("inputs_3_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_3_strides_0 = const()[name = tensor<string, []>("inputs_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_3_pad_0 = const()[name = tensor<string, []>("inputs_3_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_3_dilations_0 = const()[name = tensor<string, []>("inputs_3_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_3_groups_0 = const()[name = tensor<string, []>("inputs_3_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_23 = transpose(perm = input_23_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
-            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = encoder_conv_module_0_sequential_2_conv_weight, x = input_23)[name = tensor<string, []>("inputs_3")];
-            tensor<int32, [2]> var_282_split_sizes_0 = const()[name = tensor<string, []>("op_282_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_282_axis_0 = const()[name = tensor<string, []>("op_282_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_282_0, tensor<fp32, [5, 256, 16]> var_282_1 = split(axis = var_282_axis_0, split_sizes = var_282_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_282")];
-            tensor<fp32, [5, 256, 16]> var_284 = sigmoid(x = var_282_1)[name = tensor<string, []>("op_284")];
-            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_282_0, y = var_284)[name = tensor<string, []>("inputs_5")];
+            tensor<fp32, [5, 256, 16]> input_25 = transpose(perm = input_25_perm_0, x = x_5)[name = tensor<string, []>("transpose_54")];
+            tensor<fp32, [5, 512, 16]> inputs_3 = conv(bias = inner_encoder_conv_module_0_sequential_2_conv_bias, dilations = inputs_3_dilations_0, groups = inputs_3_groups_0, pad = inputs_3_pad_0, pad_type = inputs_3_pad_type_0, strides = inputs_3_strides_0, weight = inner_encoder_conv_module_0_sequential_2_conv_weight, x = input_25)[name = tensor<string, []>("inputs_3")];
+            tensor<int32, [2]> var_348_split_sizes_0 = const()[name = tensor<string, []>("op_348_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_348_axis_0 = const()[name = tensor<string, []>("op_348_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_348_0, tensor<fp32, [5, 256, 16]> var_348_1 = split(axis = var_348_axis_0, split_sizes = var_348_split_sizes_0, x = inputs_3)[name = tensor<string, []>("op_348")];
+            tensor<fp32, [5, 256, 16]> var_350 = sigmoid(x = var_348_1)[name = tensor<string, []>("op_350")];
+            tensor<fp32, [5, 256, 16]> inputs_5 = mul(x = var_348_0, y = var_350)[name = tensor<string, []>("inputs_5")];
             tensor<string, []> outputs_aug_1_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_1_pad_0 = const()[name = tensor<string, []>("outputs_aug_1_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_1_groups_0 = const()[name = tensor<string, []>("outputs_aug_1_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_1_strides_0 = const()[name = tensor<string, []>("outputs_aug_1_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_1_dilations_0 = const()[name = tensor<string, []>("outputs_aug_1_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
-            tensor<int32, [3]> input_25_begin_0 = const()[name = tensor<string, []>("input_25_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_25_end_0 = const()[name = tensor<string, []>("input_25_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_25_end_mask_0 = const()[name = tensor<string, []>("input_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_25 = slice_by_index(begin = input_25_begin_0, end = input_25_end_0, end_mask = input_25_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_25")];
-            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = encoder_conv_module_0_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_0_sequential_5_weight, mean = encoder_conv_module_0_sequential_5_running_mean, variance = encoder_conv_module_0_sequential_5_running_var, x = input_25)[name = tensor<string, []>("inputs_7")];
-            tensor<fp32, [5, 256, 16]> input_27 = silu(x = inputs_7)[name = tensor<string, []>("input_27")];
-            tensor<string, []> input_29_pad_type_0 = const()[name = tensor<string, []>("input_29_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_29_strides_0 = const()[name = tensor<string, []>("input_29_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_29_pad_0 = const()[name = tensor<string, []>("input_29_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_29_dilations_0 = const()[name = tensor<string, []>("input_29_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_29_groups_0 = const()[name = tensor<string, []>("input_29_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_29 = conv(bias = encoder_conv_module_0_sequential_7_conv_bias, dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = encoder_conv_module_0_sequential_7_conv_weight, x = input_27)[name = tensor<string, []>("input_29")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_1 = conv(dilations = outputs_aug_1_dilations_0, groups = outputs_aug_1_groups_0, pad = outputs_aug_1_pad_0, pad_type = outputs_aug_1_pad_type_0, strides = outputs_aug_1_strides_0, weight = inner_encoder_conv_module_0_sequential_4_conv_weight, x = inputs_5)[name = tensor<string, []>("outputs_aug_1")];
+            tensor<int32, [3]> input_27_begin_0 = const()[name = tensor<string, []>("input_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_27_end_0 = const()[name = tensor<string, []>("input_27_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_27_end_mask_0 = const()[name = tensor<string, []>("input_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_27 = slice_by_index(begin = input_27_begin_0, end = input_27_end_0, end_mask = input_27_end_mask_0, x = outputs_aug_1)[name = tensor<string, []>("input_27")];
+            tensor<fp32, [5, 256, 16]> inputs_7 = batch_norm(beta = inner_encoder_conv_module_0_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_0_sequential_5_weight, mean = inner_encoder_conv_module_0_sequential_5_running_mean, variance = inner_encoder_conv_module_0_sequential_5_running_var, x = input_27)[name = tensor<string, []>("inputs_7")];
+            tensor<fp32, [5, 256, 16]> input_29 = silu(x = inputs_7)[name = tensor<string, []>("input_29")];
+            tensor<string, []> input_31_pad_type_0 = const()[name = tensor<string, []>("input_31_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_31_pad_0 = const()[name = tensor<string, []>("input_31_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_31 = conv(bias = inner_encoder_conv_module_0_sequential_7_conv_bias, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = inner_encoder_conv_module_0_sequential_7_conv_weight, x = input_29)[name = tensor<string, []>("input_31")];
             tensor<int32, [3]> conv_out_1_perm_0 = const()[name = tensor<string, []>("conv_out_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_315_begin_0 = const()[name = tensor<string, []>("op_315_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_315_end_0 = const()[name = tensor<string, []>("op_315_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_315_end_mask_0 = const()[name = tensor<string, []>("op_315_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_29)[name = tensor<string, []>("transpose_53")];
-            tensor<fp32, [5, 1, 256]> var_315 = slice_by_index(begin = var_315_begin_0, end = var_315_end_0, end_mask = var_315_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_315")];
-            tensor<int32, [3]> var_317_perm_0 = const()[name = tensor<string, []>("op_317_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_317 = transpose(perm = var_317_perm_0, x = var_315)[name = tensor<string, []>("transpose_52")];
-            tensor<fp32, [1, 5, 256]> input_31 = add(x = x_3, y = var_317)[name = tensor<string, []>("input_31")];
-            tensor<int32, [1]> input_33_axes_0 = const()[name = tensor<string, []>("input_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_33 = layer_norm(axes = input_33_axes_0, beta = encoder_ffn2_0_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_0_module_sequential_0_weight, x = input_31)[name = tensor<string, []>("input_33")];
-            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = encoder_ffn2_0_module_sequential_1_linear_bias, weight = encoder_ffn2_0_module_sequential_1_linear_weight, x = input_33)[name = tensor<string, []>("linear_8")];
-            tensor<fp32, [1, 5, 1024]> input_35 = silu(x = inputs_9)[name = tensor<string, []>("input_35")];
-            tensor<fp32, [1, 5, 256]> input_39 = linear(bias = encoder_ffn2_0_module_sequential_4_linear_bias, weight = encoder_ffn2_0_module_sequential_4_linear_weight, x = input_35)[name = tensor<string, []>("linear_9")];
-            tensor<fp32, []> var_340 = const()[name = tensor<string, []>("op_340"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_341 = mul(x = input_39, y = var_340)[name = tensor<string, []>("op_341")];
-            tensor<fp32, [1, 5, 256]> input_41 = add(x = var_341, y = input_31)[name = tensor<string, []>("input_41")];
-            tensor<int32, [1]> input_43_axes_0 = const()[name = tensor<string, []>("input_43_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_43 = layer_norm(axes = input_43_axes_0, beta = encoder_layer_norm_0_bias, epsilon = var_29, gamma = encoder_layer_norm_0_weight, x = input_41)[name = tensor<string, []>("input_43")];
+            tensor<int32, [3]> var_381_begin_0 = const()[name = tensor<string, []>("op_381_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_381_end_0 = const()[name = tensor<string, []>("op_381_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_381_end_mask_0 = const()[name = tensor<string, []>("op_381_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_1 = transpose(perm = conv_out_1_perm_0, x = input_31)[name = tensor<string, []>("transpose_53")];
+            tensor<fp32, [5, 1, 256]> var_381 = slice_by_index(begin = var_381_begin_0, end = var_381_end_0, end_mask = var_381_end_mask_0, x = conv_out_1)[name = tensor<string, []>("op_381")];
+            tensor<int32, [3]> var_383_perm_0 = const()[name = tensor<string, []>("op_383_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_383 = transpose(perm = var_383_perm_0, x = var_381)[name = tensor<string, []>("transpose_52")];
+            tensor<fp32, [1, 5, 256]> input_33 = add(x = x_3, y = var_383)[name = tensor<string, []>("input_33")];
+            tensor<int32, [1]> input_35_axes_0 = const()[name = tensor<string, []>("input_35_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_35 = layer_norm(axes = input_35_axes_0, beta = inner_encoder_ffn2_0_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_0_module_sequential_0_weight, x = input_33)[name = tensor<string, []>("input_35")];
+            tensor<fp32, [1, 5, 1024]> inputs_9 = linear(bias = inner_encoder_ffn2_0_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_1_linear_weight, x = input_35)[name = tensor<string, []>("linear_8")];
+            tensor<fp32, [1, 5, 1024]> input_37 = silu(x = inputs_9)[name = tensor<string, []>("input_37")];
+            tensor<fp32, [1, 5, 256]> input_41 = linear(bias = inner_encoder_ffn2_0_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_0_module_sequential_4_linear_weight, x = input_37)[name = tensor<string, []>("linear_9")];
+            tensor<fp32, []> var_406 = const()[name = tensor<string, []>("op_406"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_407 = mul(x = input_41, y = var_406)[name = tensor<string, []>("op_407")];
+            tensor<fp32, [1, 5, 256]> input_43 = add(x = var_407, y = input_33)[name = tensor<string, []>("input_43")];
             tensor<int32, [1]> input_45_axes_0 = const()[name = tensor<string, []>("input_45_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = encoder_ffn1_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_1_module_sequential_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
-            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = encoder_ffn1_1_module_sequential_1_linear_bias, weight = encoder_ffn1_1_module_sequential_1_linear_weight, x = input_45)[name = tensor<string, []>("linear_10")];
-            tensor<fp32, [1, 5, 1024]> input_47 = silu(x = inputs_11)[name = tensor<string, []>("input_47")];
-            tensor<fp32, [1, 5, 256]> input_51 = linear(bias = encoder_ffn1_1_module_sequential_4_linear_bias, weight = encoder_ffn1_1_module_sequential_4_linear_weight, x = input_47)[name = tensor<string, []>("linear_11")];
-            tensor<fp32, []> var_370 = const()[name = tensor<string, []>("op_370"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_371 = mul(x = input_51, y = var_370)[name = tensor<string, []>("op_371")];
-            tensor<fp32, [1, 5, 256]> input_53 = add(x = var_371, y = input_43)[name = tensor<string, []>("input_53")];
+            tensor<fp32, [1, 5, 256]> input_45 = layer_norm(axes = input_45_axes_0, beta = inner_encoder_layer_norm_0_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_0_weight, x = input_43)[name = tensor<string, []>("input_45")];
+            tensor<int32, [1]> input_47_axes_0 = const()[name = tensor<string, []>("input_47_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_47 = layer_norm(axes = input_47_axes_0, beta = inner_encoder_ffn1_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_1_module_sequential_0_weight, x = input_45)[name = tensor<string, []>("input_47")];
+            tensor<fp32, [1, 5, 1024]> inputs_11 = linear(bias = inner_encoder_ffn1_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_1_linear_weight, x = input_47)[name = tensor<string, []>("linear_10")];
+            tensor<fp32, [1, 5, 1024]> input_49 = silu(x = inputs_11)[name = tensor<string, []>("input_49")];
+            tensor<fp32, [1, 5, 256]> input_53 = linear(bias = inner_encoder_ffn1_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_1_module_sequential_4_linear_weight, x = input_49)[name = tensor<string, []>("linear_11")];
+            tensor<fp32, []> var_436 = const()[name = tensor<string, []>("op_436"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_437 = mul(x = input_53, y = var_436)[name = tensor<string, []>("op_437")];
+            tensor<fp32, [1, 5, 256]> input_55 = add(x = var_437, y = input_45)[name = tensor<string, []>("input_55")];
             tensor<int32, [1]> x_7_axes_0 = const()[name = tensor<string, []>("x_7_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = encoder_ret_lns_1_bias, epsilon = var_29, gamma = encoder_ret_lns_1_weight, x = input_53)[name = tensor<string, []>("x_7")];
+            tensor<fp32, [1, 5, 256]> x_7 = layer_norm(axes = x_7_axes_0, beta = inner_encoder_ret_lns_1_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_1_weight, x = input_55)[name = tensor<string, []>("x_7")];
             tensor<int32, [5]> prev_kv_3_begin_0 = const()[name = tensor<string, []>("prev_kv_3_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_3_end_0 = const()[name = tensor<string, []>("prev_kv_3_end_0"), val = tensor<int32, [5]>([2, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_3_end_mask_0 = const()[name = tensor<string, []>("prev_kv_3_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -426,183 +452,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_3_end_mask_0 = const()[name = tensor<string, []>("prev_scale_3_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_3_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_3_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_3 = slice_by_index(begin = prev_scale_3_begin_0, end = prev_scale_3_end_0, end_mask = prev_scale_3_end_mask_0, squeeze_mask = prev_scale_3_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_3")];
-            tensor<fp32, [1, 5, 256]> var_385 = linear(bias = encoder_q_proj_1_bias, weight = encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
-            tensor<int32, [4]> var_386 = const()[name = tensor<string, []>("op_386"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_387 = reshape(shape = var_386, x = var_385)[name = tensor<string, []>("op_387")];
+            tensor<fp32, [1, 5, 256]> var_451 = linear(bias = inner_encoder_q_proj_1_bias, weight = inner_encoder_q_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_12")];
+            tensor<int32, [4]> var_452 = const()[name = tensor<string, []>("op_452"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_453 = reshape(shape = var_452, x = var_451)[name = tensor<string, []>("op_453")];
             tensor<int32, [4]> q_3_perm_0 = const()[name = tensor<string, []>("q_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_391 = linear(bias = encoder_k_proj_1_bias, weight = encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
-            tensor<fp32, []> var_392 = const()[name = tensor<string, []>("op_392"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_393 = mul(x = var_391, y = var_392)[name = tensor<string, []>("op_393")];
-            tensor<int32, [4]> var_394 = const()[name = tensor<string, []>("op_394"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_395 = reshape(shape = var_394, x = var_393)[name = tensor<string, []>("op_395")];
+            tensor<fp32, [1, 5, 256]> var_457 = linear(bias = inner_encoder_k_proj_1_bias, weight = inner_encoder_k_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_13")];
+            tensor<fp32, []> var_458 = const()[name = tensor<string, []>("op_458"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_459 = mul(x = var_457, y = var_458)[name = tensor<string, []>("op_459")];
+            tensor<int32, [4]> var_460 = const()[name = tensor<string, []>("op_460"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_461 = reshape(shape = var_460, x = var_459)[name = tensor<string, []>("op_461")];
             tensor<int32, [4]> k_3_perm_0 = const()[name = tensor<string, []>("k_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_399 = linear(bias = encoder_v_proj_1_bias, weight = encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
-            tensor<int32, [4]> var_400 = const()[name = tensor<string, []>("op_400"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_401 = reshape(shape = var_400, x = var_399)[name = tensor<string, []>("op_401")];
+            tensor<fp32, [1, 5, 256]> var_465 = linear(bias = inner_encoder_v_proj_1_bias, weight = inner_encoder_v_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_14")];
+            tensor<int32, [4]> var_466 = const()[name = tensor<string, []>("op_466"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_467 = reshape(shape = var_466, x = var_465)[name = tensor<string, []>("op_467")];
             tensor<int32, [4]> v_3_perm_0 = const()[name = tensor<string, []>("v_3_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_57 = linear(bias = encoder_g_proj_1_bias, weight = encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
+            tensor<fp32, [1, 5, 256]> input_59 = linear(bias = inner_encoder_g_proj_1_bias, weight = inner_encoder_g_proj_1_weight, x = x_7)[name = tensor<string, []>("linear_15")];
             tensor<fp32, [1]> sqrt_s0_3 = sqrt(x = prev_scale_3)[name = tensor<string, []>("sqrt_s0_3")];
-            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = encoder__t_index)[name = tensor<string, []>("s_t_3")];
+            tensor<fp32, [5]> s_t_3 = add(x = prev_scale_3, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_3")];
             tensor<fp32, [5]> sqrt_s_t_3 = sqrt(x = s_t_3)[name = tensor<string, []>("sqrt_s_t_3")];
             tensor<bool, []> qk_3_transpose_x_1 = const()[name = tensor<string, []>("qk_3_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_3_transpose_y_1 = const()[name = tensor<string, []>("qk_3_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_395)[name = tensor<string, []>("transpose_50")];
-            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_387)[name = tensor<string, []>("transpose_51")];
+            tensor<fp32, [1, 4, 5, 64]> k_3 = transpose(perm = k_3_perm_0, x = var_461)[name = tensor<string, []>("transpose_50")];
+            tensor<fp32, [1, 4, 5, 64]> q_3 = transpose(perm = q_3_perm_0, x = var_453)[name = tensor<string, []>("transpose_51")];
             tensor<fp32, [1, 4, 5, 5]> qk_3 = matmul(transpose_x = qk_3_transpose_x_1, transpose_y = qk_3_transpose_y_1, x = q_3, y = k_3)[name = tensor<string, []>("qk_3")];
-            tensor<int32, [2]> var_411 = const()[name = tensor<string, []>("op_411"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_412 = reshape(shape = var_411, x = sqrt_s_t_3)[name = tensor<string, []>("op_412")];
-            tensor<fp32, [5, 5]> M_3 = real_div(x = encoder__causal_mask, y = var_412)[name = tensor<string, []>("M_3")];
-            tensor<fp32, [1, 4, 5, 5]> var_414 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_414")];
+            tensor<int32, [2]> var_477 = const()[name = tensor<string, []>("op_477"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_478 = reshape(shape = var_477, x = sqrt_s_t_3)[name = tensor<string, []>("op_478")];
+            tensor<fp32, [5, 5]> M_3 = real_div(x = inner_encoder__causal_mask, y = var_478)[name = tensor<string, []>("M_3")];
+            tensor<fp32, [1, 4, 5, 5]> var_480 = mul(x = qk_3, y = M_3)[name = tensor<string, []>("op_480")];
             tensor<bool, []> inner_3_transpose_x_0 = const()[name = tensor<string, []>("inner_3_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_3_transpose_y_0 = const()[name = tensor<string, []>("inner_3_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_401)[name = tensor<string, []>("transpose_49")];
-            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_414, y = v_3)[name = tensor<string, []>("inner_3")];
-            tensor<bool, []> var_416_transpose_x_0 = const()[name = tensor<string, []>("op_416_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_416_transpose_y_0 = const()[name = tensor<string, []>("op_416_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_416 = matmul(transpose_x = var_416_transpose_x_0, transpose_y = var_416_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_416")];
-            tensor<fp32, [5]> var_417 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_417")];
-            tensor<int32, [4]> var_418 = const()[name = tensor<string, []>("op_418"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_419 = reshape(shape = var_418, x = var_417)[name = tensor<string, []>("op_419")];
-            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_416, y = var_419)[name = tensor<string, []>("cross_3")];
+            tensor<fp32, [1, 4, 5, 64]> v_3 = transpose(perm = v_3_perm_0, x = var_467)[name = tensor<string, []>("transpose_49")];
+            tensor<fp32, [1, 4, 5, 64]> inner_3 = matmul(transpose_x = inner_3_transpose_x_0, transpose_y = inner_3_transpose_y_0, x = var_480, y = v_3)[name = tensor<string, []>("inner_3")];
+            tensor<bool, []> var_482_transpose_x_0 = const()[name = tensor<string, []>("op_482_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_482_transpose_y_0 = const()[name = tensor<string, []>("op_482_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_482 = matmul(transpose_x = var_482_transpose_x_0, transpose_y = var_482_transpose_y_0, x = q_3, y = prev_kv_3)[name = tensor<string, []>("op_482")];
+            tensor<fp32, [5]> var_483 = real_div(x = sqrt_s0_3, y = sqrt_s_t_3)[name = tensor<string, []>("op_483")];
+            tensor<int32, [4]> var_484 = const()[name = tensor<string, []>("op_484"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_485 = reshape(shape = var_484, x = var_483)[name = tensor<string, []>("op_485")];
+            tensor<fp32, [1, 4, 5, 64]> cross_3 = mul(x = var_482, y = var_485)[name = tensor<string, []>("cross_3")];
             tensor<fp32, [1, 4, 5, 64]> out_7 = add(x = inner_3, y = cross_3)[name = tensor<string, []>("out_7")];
-            tensor<fp32, [1, 4, 64, 64]> var_422 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_422")];
-            tensor<bool, []> var_424_transpose_x_1 = const()[name = tensor<string, []>("op_424_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_424_transpose_y_1 = const()[name = tensor<string, []>("op_424_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_424 = matmul(transpose_x = var_424_transpose_x_1, transpose_y = var_424_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_424")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_422, y = var_424)[name = tensor<string, []>("new_kv_unnorm_3")];
-            tensor<fp32, []> var_426 = const()[name = tensor<string, []>("op_426"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_426)[name = tensor<string, []>("new_scale_3")];
-            tensor<fp32, [1]> var_428 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_428")];
-            tensor<fp32, [1, 4, 64, 64]> var_429 = real_div(x = new_kv_unnorm_3, y = var_428)[name = tensor<string, []>("op_429")];
-            tensor<int32, [4]> var_430_perm_0 = const()[name = tensor<string, []>("op_430_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_488 = mul(x = prev_kv_3, y = sqrt_s0_3)[name = tensor<string, []>("op_488")];
+            tensor<bool, []> var_490_transpose_x_1 = const()[name = tensor<string, []>("op_490_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_490_transpose_y_1 = const()[name = tensor<string, []>("op_490_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_490 = matmul(transpose_x = var_490_transpose_x_1, transpose_y = var_490_transpose_y_1, x = k_3, y = v_3)[name = tensor<string, []>("op_490")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_3 = add(x = var_488, y = var_490)[name = tensor<string, []>("new_kv_unnorm_3")];
+            tensor<fp32, []> var_492 = const()[name = tensor<string, []>("op_492"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_3 = add(x = prev_scale_3, y = var_492)[name = tensor<string, []>("new_scale_3")];
+            tensor<fp32, [1]> var_494 = sqrt(x = new_scale_3)[name = tensor<string, []>("op_494")];
+            tensor<fp32, [1, 4, 64, 64]> var_495 = real_div(x = new_kv_unnorm_3, y = var_494)[name = tensor<string, []>("op_495")];
+            tensor<int32, [4]> var_496_perm_0 = const()[name = tensor<string, []>("op_496_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_430 = transpose(perm = var_430_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
-            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_18, x = var_430)[name = tensor<string, []>("out_9")];
-            tensor<int32, [3]> var_434 = const()[name = tensor<string, []>("op_434"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_434, x = out_9)[name = tensor<string, []>("out_11")];
-            tensor<fp32, [1, 5, 256]> var_436 = silu(x = input_57)[name = tensor<string, []>("op_436")];
-            tensor<fp32, [1, 5, 256]> input_59 = mul(x = var_436, y = out_11)[name = tensor<string, []>("input_59")];
-            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = encoder_out_proj_1_bias, weight = encoder_out_proj_1_weight, x = input_59)[name = tensor<string, []>("linear_16")];
-            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_53, y = ret_out_3)[name = tensor<string, []>("x_9")];
+            tensor<fp32, [1, 5, 4, 64]> var_496 = transpose(perm = var_496_perm_0, x = out_7)[name = tensor<string, []>("transpose_48")];
+            tensor<fp32, [1, 5, 4, 64]> out_9 = layer_norm(axes = out_9_axes_0, epsilon = var_84, x = var_496)[name = tensor<string, []>("out_9")];
+            tensor<int32, [3]> var_500 = const()[name = tensor<string, []>("op_500"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_11 = reshape(shape = var_500, x = out_9)[name = tensor<string, []>("out_11")];
+            tensor<fp32, [1, 5, 256]> var_502 = silu(x = input_59)[name = tensor<string, []>("op_502")];
+            tensor<fp32, [1, 5, 256]> input_61 = mul(x = var_502, y = out_11)[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 5, 256]> ret_out_3 = linear(bias = inner_encoder_out_proj_1_bias, weight = inner_encoder_out_proj_1_weight, x = input_61)[name = tensor<string, []>("linear_16")];
+            tensor<fp32, [1, 5, 256]> x_9 = add(x = input_55, y = ret_out_3)[name = tensor<string, []>("x_9")];
             tensor<int32, [4]> window_13_begin_0 = const()[name = tensor<string, []>("window_13_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> window_13_end_0 = const()[name = tensor<string, []>("window_13_end_0"), val = tensor<int32, [4]>([2, 1, 16, 256])];
             tensor<bool, [4]> window_13_end_mask_0 = const()[name = tensor<string, []>("window_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_13_squeeze_mask_0 = const()[name = tensor<string, []>("window_13_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_13 = slice_by_index(begin = window_13_begin_0, end = window_13_end_0, end_mask = window_13_end_mask_0, squeeze_mask = window_13_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_13")];
-            tensor<int32, [3]> var_444_begin_0 = const()[name = tensor<string, []>("op_444_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_444_end_0 = const()[name = tensor<string, []>("op_444_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_444_end_mask_0 = const()[name = tensor<string, []>("op_444_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_444 = slice_by_index(begin = var_444_begin_0, end = var_444_end_0, end_mask = var_444_end_mask_0, x = x_9)[name = tensor<string, []>("op_444")];
-            tensor<int32, [3]> var_447_begin_0 = const()[name = tensor<string, []>("op_447_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_447_end_0 = const()[name = tensor<string, []>("op_447_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_447_end_mask_0 = const()[name = tensor<string, []>("op_447_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_447 = slice_by_index(begin = var_447_begin_0, end = var_447_end_0, end_mask = var_447_end_mask_0, x = window_13)[name = tensor<string, []>("op_447")];
+            tensor<int32, [3]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_510 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = x_9)[name = tensor<string, []>("op_510")];
+            tensor<int32, [3]> var_513_begin_0 = const()[name = tensor<string, []>("op_513_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_513_end_0 = const()[name = tensor<string, []>("op_513_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_513_end_mask_0 = const()[name = tensor<string, []>("op_513_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_513 = slice_by_index(begin = var_513_begin_0, end = var_513_end_0, end_mask = var_513_end_mask_0, x = window_13)[name = tensor<string, []>("op_513")];
             tensor<bool, []> window_15_interleave_0 = const()[name = tensor<string, []>("window_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_27, interleave = window_15_interleave_0, values = (var_447, var_444))[name = tensor<string, []>("window_15")];
-            tensor<int32, [3]> var_452_begin_0 = const()[name = tensor<string, []>("op_452_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_452_end_0 = const()[name = tensor<string, []>("op_452_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_452_end_mask_0 = const()[name = tensor<string, []>("op_452_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_452 = slice_by_index(begin = var_452_begin_0, end = var_452_end_0, end_mask = var_452_end_mask_0, x = x_9)[name = tensor<string, []>("op_452")];
-            tensor<int32, [3]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_455 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = window_15)[name = tensor<string, []>("op_455")];
+            tensor<fp32, [1, 16, 256]> window_15 = concat(axis = var_93, interleave = window_15_interleave_0, values = (var_513, var_510))[name = tensor<string, []>("window_15")];
+            tensor<int32, [3]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_518 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = x_9)[name = tensor<string, []>("op_518")];
+            tensor<int32, [3]> var_521_begin_0 = const()[name = tensor<string, []>("op_521_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_521_end_0 = const()[name = tensor<string, []>("op_521_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_521_end_mask_0 = const()[name = tensor<string, []>("op_521_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_521 = slice_by_index(begin = var_521_begin_0, end = var_521_end_0, end_mask = var_521_end_mask_0, x = window_15)[name = tensor<string, []>("op_521")];
             tensor<bool, []> window_17_interleave_0 = const()[name = tensor<string, []>("window_17_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_27, interleave = window_17_interleave_0, values = (var_455, var_452))[name = tensor<string, []>("window_17")];
-            tensor<int32, [3]> var_460_begin_0 = const()[name = tensor<string, []>("op_460_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_460_end_0 = const()[name = tensor<string, []>("op_460_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_460_end_mask_0 = const()[name = tensor<string, []>("op_460_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_460 = slice_by_index(begin = var_460_begin_0, end = var_460_end_0, end_mask = var_460_end_mask_0, x = x_9)[name = tensor<string, []>("op_460")];
-            tensor<int32, [3]> var_463_begin_0 = const()[name = tensor<string, []>("op_463_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_463_end_0 = const()[name = tensor<string, []>("op_463_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_463_end_mask_0 = const()[name = tensor<string, []>("op_463_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_463 = slice_by_index(begin = var_463_begin_0, end = var_463_end_0, end_mask = var_463_end_mask_0, x = window_17)[name = tensor<string, []>("op_463")];
+            tensor<fp32, [1, 16, 256]> window_17 = concat(axis = var_93, interleave = window_17_interleave_0, values = (var_521, var_518))[name = tensor<string, []>("window_17")];
+            tensor<int32, [3]> var_526_begin_0 = const()[name = tensor<string, []>("op_526_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_526_end_0 = const()[name = tensor<string, []>("op_526_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_526_end_mask_0 = const()[name = tensor<string, []>("op_526_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_526 = slice_by_index(begin = var_526_begin_0, end = var_526_end_0, end_mask = var_526_end_mask_0, x = x_9)[name = tensor<string, []>("op_526")];
+            tensor<int32, [3]> var_529_begin_0 = const()[name = tensor<string, []>("op_529_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_529_end_0 = const()[name = tensor<string, []>("op_529_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_529_end_mask_0 = const()[name = tensor<string, []>("op_529_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_529 = slice_by_index(begin = var_529_begin_0, end = var_529_end_0, end_mask = var_529_end_mask_0, x = window_17)[name = tensor<string, []>("op_529")];
             tensor<bool, []> window_19_interleave_0 = const()[name = tensor<string, []>("window_19_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_27, interleave = window_19_interleave_0, values = (var_463, var_460))[name = tensor<string, []>("window_19")];
-            tensor<int32, [3]> var_468_begin_0 = const()[name = tensor<string, []>("op_468_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_468_end_0 = const()[name = tensor<string, []>("op_468_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_468_end_mask_0 = const()[name = tensor<string, []>("op_468_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_468 = slice_by_index(begin = var_468_begin_0, end = var_468_end_0, end_mask = var_468_end_mask_0, x = x_9)[name = tensor<string, []>("op_468")];
-            tensor<int32, [3]> var_471_begin_0 = const()[name = tensor<string, []>("op_471_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_471_end_0 = const()[name = tensor<string, []>("op_471_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_471_end_mask_0 = const()[name = tensor<string, []>("op_471_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_471 = slice_by_index(begin = var_471_begin_0, end = var_471_end_0, end_mask = var_471_end_mask_0, x = window_19)[name = tensor<string, []>("op_471")];
+            tensor<fp32, [1, 16, 256]> window_19 = concat(axis = var_93, interleave = window_19_interleave_0, values = (var_529, var_526))[name = tensor<string, []>("window_19")];
+            tensor<int32, [3]> var_534_begin_0 = const()[name = tensor<string, []>("op_534_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_534_end_0 = const()[name = tensor<string, []>("op_534_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_534_end_mask_0 = const()[name = tensor<string, []>("op_534_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_534 = slice_by_index(begin = var_534_begin_0, end = var_534_end_0, end_mask = var_534_end_mask_0, x = x_9)[name = tensor<string, []>("op_534")];
+            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = window_19)[name = tensor<string, []>("op_537")];
             tensor<bool, []> window_21_interleave_0 = const()[name = tensor<string, []>("window_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_27, interleave = window_21_interleave_0, values = (var_471, var_468))[name = tensor<string, []>("window_21")];
-            tensor<int32, [3]> var_476_begin_0 = const()[name = tensor<string, []>("op_476_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_476_end_0 = const()[name = tensor<string, []>("op_476_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_476_end_mask_0 = const()[name = tensor<string, []>("op_476_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_476 = slice_by_index(begin = var_476_begin_0, end = var_476_end_0, end_mask = var_476_end_mask_0, x = x_9)[name = tensor<string, []>("op_476")];
-            tensor<int32, [3]> var_479_begin_0 = const()[name = tensor<string, []>("op_479_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_479_end_0 = const()[name = tensor<string, []>("op_479_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_479_end_mask_0 = const()[name = tensor<string, []>("op_479_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_479 = slice_by_index(begin = var_479_begin_0, end = var_479_end_0, end_mask = var_479_end_mask_0, x = window_21)[name = tensor<string, []>("op_479")];
+            tensor<fp32, [1, 16, 256]> window_21 = concat(axis = var_93, interleave = window_21_interleave_0, values = (var_537, var_534))[name = tensor<string, []>("window_21")];
+            tensor<int32, [3]> var_542_begin_0 = const()[name = tensor<string, []>("op_542_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_542_end_0 = const()[name = tensor<string, []>("op_542_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_542_end_mask_0 = const()[name = tensor<string, []>("op_542_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_542 = slice_by_index(begin = var_542_begin_0, end = var_542_end_0, end_mask = var_542_end_mask_0, x = x_9)[name = tensor<string, []>("op_542")];
+            tensor<int32, [3]> var_545_begin_0 = const()[name = tensor<string, []>("op_545_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_545_end_0 = const()[name = tensor<string, []>("op_545_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_545_end_mask_0 = const()[name = tensor<string, []>("op_545_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_545 = slice_by_index(begin = var_545_begin_0, end = var_545_end_0, end_mask = var_545_end_mask_0, x = window_21)[name = tensor<string, []>("op_545")];
             tensor<bool, []> window_23_interleave_0 = const()[name = tensor<string, []>("window_23_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_27, interleave = window_23_interleave_0, values = (var_479, var_476))[name = tensor<string, []>("window_23")];
-            tensor<bool, []> input_61_interleave_0 = const()[name = tensor<string, []>("input_61_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_61 = concat(axis = var_24, interleave = input_61_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_61")];
+            tensor<fp32, [1, 16, 256]> window_23 = concat(axis = var_93, interleave = window_23_interleave_0, values = (var_545, var_542))[name = tensor<string, []>("window_23")];
+            tensor<bool, []> input_63_interleave_0 = const()[name = tensor<string, []>("input_63_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_63 = concat(axis = var_79, interleave = input_63_interleave_0, values = (window_15, window_17, window_19, window_21, window_23))[name = tensor<string, []>("input_63")];
             tensor<int32, [1]> x_11_axes_0 = const()[name = tensor<string, []>("x_11_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = encoder_conv_module_1_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_0_weight, x = input_61)[name = tensor<string, []>("x_11")];
-            tensor<int32, [3]> input_63_perm_0 = const()[name = tensor<string, []>("input_63_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_11 = layer_norm(axes = x_11_axes_0, beta = inner_encoder_conv_module_1_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_0_weight, x = input_63)[name = tensor<string, []>("x_11")];
+            tensor<int32, [3]> input_65_perm_0 = const()[name = tensor<string, []>("input_65_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_13_pad_type_0 = const()[name = tensor<string, []>("inputs_13_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_13_strides_0 = const()[name = tensor<string, []>("inputs_13_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_13_pad_0 = const()[name = tensor<string, []>("inputs_13_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_13_dilations_0 = const()[name = tensor<string, []>("inputs_13_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_13_groups_0 = const()[name = tensor<string, []>("inputs_13_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_63 = transpose(perm = input_63_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
-            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = encoder_conv_module_1_sequential_2_conv_weight, x = input_63)[name = tensor<string, []>("inputs_13")];
-            tensor<int32, [2]> var_504_split_sizes_0 = const()[name = tensor<string, []>("op_504_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_504_axis_0 = const()[name = tensor<string, []>("op_504_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_504_0, tensor<fp32, [5, 256, 16]> var_504_1 = split(axis = var_504_axis_0, split_sizes = var_504_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_504")];
-            tensor<fp32, [5, 256, 16]> var_506 = sigmoid(x = var_504_1)[name = tensor<string, []>("op_506")];
-            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_504_0, y = var_506)[name = tensor<string, []>("inputs_15")];
+            tensor<fp32, [5, 256, 16]> input_65 = transpose(perm = input_65_perm_0, x = x_11)[name = tensor<string, []>("transpose_47")];
+            tensor<fp32, [5, 512, 16]> inputs_13 = conv(bias = inner_encoder_conv_module_1_sequential_2_conv_bias, dilations = inputs_13_dilations_0, groups = inputs_13_groups_0, pad = inputs_13_pad_0, pad_type = inputs_13_pad_type_0, strides = inputs_13_strides_0, weight = inner_encoder_conv_module_1_sequential_2_conv_weight, x = input_65)[name = tensor<string, []>("inputs_13")];
+            tensor<int32, [2]> var_570_split_sizes_0 = const()[name = tensor<string, []>("op_570_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_570_axis_0 = const()[name = tensor<string, []>("op_570_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_570_0, tensor<fp32, [5, 256, 16]> var_570_1 = split(axis = var_570_axis_0, split_sizes = var_570_split_sizes_0, x = inputs_13)[name = tensor<string, []>("op_570")];
+            tensor<fp32, [5, 256, 16]> var_572 = sigmoid(x = var_570_1)[name = tensor<string, []>("op_572")];
+            tensor<fp32, [5, 256, 16]> inputs_15 = mul(x = var_570_0, y = var_572)[name = tensor<string, []>("inputs_15")];
             tensor<string, []> outputs_aug_3_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_3_pad_0 = const()[name = tensor<string, []>("outputs_aug_3_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_3_groups_0 = const()[name = tensor<string, []>("outputs_aug_3_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_3_strides_0 = const()[name = tensor<string, []>("outputs_aug_3_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_3_dilations_0 = const()[name = tensor<string, []>("outputs_aug_3_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
-            tensor<int32, [3]> input_65_begin_0 = const()[name = tensor<string, []>("input_65_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_65_end_0 = const()[name = tensor<string, []>("input_65_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_65_end_mask_0 = const()[name = tensor<string, []>("input_65_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_65 = slice_by_index(begin = input_65_begin_0, end = input_65_end_0, end_mask = input_65_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_65")];
-            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = encoder_conv_module_1_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_1_sequential_5_weight, mean = encoder_conv_module_1_sequential_5_running_mean, variance = encoder_conv_module_1_sequential_5_running_var, x = input_65)[name = tensor<string, []>("inputs_17")];
-            tensor<fp32, [5, 256, 16]> input_67 = silu(x = inputs_17)[name = tensor<string, []>("input_67")];
-            tensor<string, []> input_69_pad_type_0 = const()[name = tensor<string, []>("input_69_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_69_strides_0 = const()[name = tensor<string, []>("input_69_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_69_pad_0 = const()[name = tensor<string, []>("input_69_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_69_dilations_0 = const()[name = tensor<string, []>("input_69_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_69_groups_0 = const()[name = tensor<string, []>("input_69_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_69 = conv(bias = encoder_conv_module_1_sequential_7_conv_bias, dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = encoder_conv_module_1_sequential_7_conv_weight, x = input_67)[name = tensor<string, []>("input_69")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_3 = conv(dilations = outputs_aug_3_dilations_0, groups = outputs_aug_3_groups_0, pad = outputs_aug_3_pad_0, pad_type = outputs_aug_3_pad_type_0, strides = outputs_aug_3_strides_0, weight = inner_encoder_conv_module_1_sequential_4_conv_weight, x = inputs_15)[name = tensor<string, []>("outputs_aug_3")];
+            tensor<int32, [3]> input_67_begin_0 = const()[name = tensor<string, []>("input_67_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_67_end_0 = const()[name = tensor<string, []>("input_67_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_67_end_mask_0 = const()[name = tensor<string, []>("input_67_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_67 = slice_by_index(begin = input_67_begin_0, end = input_67_end_0, end_mask = input_67_end_mask_0, x = outputs_aug_3)[name = tensor<string, []>("input_67")];
+            tensor<fp32, [5, 256, 16]> inputs_17 = batch_norm(beta = inner_encoder_conv_module_1_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_1_sequential_5_weight, mean = inner_encoder_conv_module_1_sequential_5_running_mean, variance = inner_encoder_conv_module_1_sequential_5_running_var, x = input_67)[name = tensor<string, []>("inputs_17")];
+            tensor<fp32, [5, 256, 16]> input_69 = silu(x = inputs_17)[name = tensor<string, []>("input_69")];
+            tensor<string, []> input_71_pad_type_0 = const()[name = tensor<string, []>("input_71_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_71_strides_0 = const()[name = tensor<string, []>("input_71_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_71_pad_0 = const()[name = tensor<string, []>("input_71_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_71_dilations_0 = const()[name = tensor<string, []>("input_71_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_71_groups_0 = const()[name = tensor<string, []>("input_71_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_71 = conv(bias = inner_encoder_conv_module_1_sequential_7_conv_bias, dilations = input_71_dilations_0, groups = input_71_groups_0, pad = input_71_pad_0, pad_type = input_71_pad_type_0, strides = input_71_strides_0, weight = inner_encoder_conv_module_1_sequential_7_conv_weight, x = input_69)[name = tensor<string, []>("input_71")];
             tensor<int32, [3]> conv_out_3_perm_0 = const()[name = tensor<string, []>("conv_out_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_537_begin_0 = const()[name = tensor<string, []>("op_537_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_537_end_0 = const()[name = tensor<string, []>("op_537_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_537_end_mask_0 = const()[name = tensor<string, []>("op_537_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_69)[name = tensor<string, []>("transpose_46")];
-            tensor<fp32, [5, 1, 256]> var_537 = slice_by_index(begin = var_537_begin_0, end = var_537_end_0, end_mask = var_537_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_537")];
-            tensor<int32, [3]> var_539_perm_0 = const()[name = tensor<string, []>("op_539_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_539 = transpose(perm = var_539_perm_0, x = var_537)[name = tensor<string, []>("transpose_45")];
-            tensor<fp32, [1, 5, 256]> input_71 = add(x = x_9, y = var_539)[name = tensor<string, []>("input_71")];
-            tensor<int32, [1]> input_73_axes_0 = const()[name = tensor<string, []>("input_73_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_73 = layer_norm(axes = input_73_axes_0, beta = encoder_ffn2_1_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_1_module_sequential_0_weight, x = input_71)[name = tensor<string, []>("input_73")];
-            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = encoder_ffn2_1_module_sequential_1_linear_bias, weight = encoder_ffn2_1_module_sequential_1_linear_weight, x = input_73)[name = tensor<string, []>("linear_17")];
-            tensor<fp32, [1, 5, 1024]> input_75 = silu(x = inputs_19)[name = tensor<string, []>("input_75")];
-            tensor<fp32, [1, 5, 256]> input_79 = linear(bias = encoder_ffn2_1_module_sequential_4_linear_bias, weight = encoder_ffn2_1_module_sequential_4_linear_weight, x = input_75)[name = tensor<string, []>("linear_18")];
-            tensor<fp32, []> var_562 = const()[name = tensor<string, []>("op_562"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_563 = mul(x = input_79, y = var_562)[name = tensor<string, []>("op_563")];
-            tensor<fp32, [1, 5, 256]> input_81 = add(x = var_563, y = input_71)[name = tensor<string, []>("input_81")];
-            tensor<int32, [1]> input_83_axes_0 = const()[name = tensor<string, []>("input_83_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_83 = layer_norm(axes = input_83_axes_0, beta = encoder_layer_norm_1_bias, epsilon = var_29, gamma = encoder_layer_norm_1_weight, x = input_81)[name = tensor<string, []>("input_83")];
+            tensor<int32, [3]> var_603_begin_0 = const()[name = tensor<string, []>("op_603_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_603_end_0 = const()[name = tensor<string, []>("op_603_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_603_end_mask_0 = const()[name = tensor<string, []>("op_603_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_3 = transpose(perm = conv_out_3_perm_0, x = input_71)[name = tensor<string, []>("transpose_46")];
+            tensor<fp32, [5, 1, 256]> var_603 = slice_by_index(begin = var_603_begin_0, end = var_603_end_0, end_mask = var_603_end_mask_0, x = conv_out_3)[name = tensor<string, []>("op_603")];
+            tensor<int32, [3]> var_605_perm_0 = const()[name = tensor<string, []>("op_605_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_605 = transpose(perm = var_605_perm_0, x = var_603)[name = tensor<string, []>("transpose_45")];
+            tensor<fp32, [1, 5, 256]> input_73 = add(x = x_9, y = var_605)[name = tensor<string, []>("input_73")];
+            tensor<int32, [1]> input_75_axes_0 = const()[name = tensor<string, []>("input_75_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_75 = layer_norm(axes = input_75_axes_0, beta = inner_encoder_ffn2_1_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_1_module_sequential_0_weight, x = input_73)[name = tensor<string, []>("input_75")];
+            tensor<fp32, [1, 5, 1024]> inputs_19 = linear(bias = inner_encoder_ffn2_1_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_1_linear_weight, x = input_75)[name = tensor<string, []>("linear_17")];
+            tensor<fp32, [1, 5, 1024]> input_77 = silu(x = inputs_19)[name = tensor<string, []>("input_77")];
+            tensor<fp32, [1, 5, 256]> input_81 = linear(bias = inner_encoder_ffn2_1_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_1_module_sequential_4_linear_weight, x = input_77)[name = tensor<string, []>("linear_18")];
+            tensor<fp32, []> var_628 = const()[name = tensor<string, []>("op_628"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_629 = mul(x = input_81, y = var_628)[name = tensor<string, []>("op_629")];
+            tensor<fp32, [1, 5, 256]> input_83 = add(x = var_629, y = input_73)[name = tensor<string, []>("input_83")];
             tensor<int32, [1]> input_85_axes_0 = const()[name = tensor<string, []>("input_85_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = encoder_ffn1_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_2_module_sequential_0_weight, x = input_83)[name = tensor<string, []>("input_85")];
-            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = encoder_ffn1_2_module_sequential_1_linear_bias, weight = encoder_ffn1_2_module_sequential_1_linear_weight, x = input_85)[name = tensor<string, []>("linear_19")];
-            tensor<fp32, [1, 5, 1024]> input_87 = silu(x = inputs_21)[name = tensor<string, []>("input_87")];
-            tensor<fp32, [1, 5, 256]> input_91 = linear(bias = encoder_ffn1_2_module_sequential_4_linear_bias, weight = encoder_ffn1_2_module_sequential_4_linear_weight, x = input_87)[name = tensor<string, []>("linear_20")];
-            tensor<fp32, []> var_592 = const()[name = tensor<string, []>("op_592"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_593 = mul(x = input_91, y = var_592)[name = tensor<string, []>("op_593")];
-            tensor<fp32, [1, 5, 256]> input_93 = add(x = var_593, y = input_83)[name = tensor<string, []>("input_93")];
+            tensor<fp32, [1, 5, 256]> input_85 = layer_norm(axes = input_85_axes_0, beta = inner_encoder_layer_norm_1_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_1_weight, x = input_83)[name = tensor<string, []>("input_85")];
+            tensor<int32, [1]> input_87_axes_0 = const()[name = tensor<string, []>("input_87_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_87 = layer_norm(axes = input_87_axes_0, beta = inner_encoder_ffn1_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_2_module_sequential_0_weight, x = input_85)[name = tensor<string, []>("input_87")];
+            tensor<fp32, [1, 5, 1024]> inputs_21 = linear(bias = inner_encoder_ffn1_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_1_linear_weight, x = input_87)[name = tensor<string, []>("linear_19")];
+            tensor<fp32, [1, 5, 1024]> input_89 = silu(x = inputs_21)[name = tensor<string, []>("input_89")];
+            tensor<fp32, [1, 5, 256]> input_93 = linear(bias = inner_encoder_ffn1_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_2_module_sequential_4_linear_weight, x = input_89)[name = tensor<string, []>("linear_20")];
+            tensor<fp32, []> var_658 = const()[name = tensor<string, []>("op_658"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_659 = mul(x = input_93, y = var_658)[name = tensor<string, []>("op_659")];
+            tensor<fp32, [1, 5, 256]> input_95 = add(x = var_659, y = input_85)[name = tensor<string, []>("input_95")];
             tensor<int32, [1]> x_13_axes_0 = const()[name = tensor<string, []>("x_13_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = encoder_ret_lns_2_bias, epsilon = var_29, gamma = encoder_ret_lns_2_weight, x = input_93)[name = tensor<string, []>("x_13")];
+            tensor<fp32, [1, 5, 256]> x_13 = layer_norm(axes = x_13_axes_0, beta = inner_encoder_ret_lns_2_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_2_weight, x = input_95)[name = tensor<string, []>("x_13")];
             tensor<int32, [5]> prev_kv_5_begin_0 = const()[name = tensor<string, []>("prev_kv_5_begin_0"), val = tensor<int32, [5]>([2, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_5_end_0 = const()[name = tensor<string, []>("prev_kv_5_end_0"), val = tensor<int32, [5]>([3, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_5_end_mask_0 = const()[name = tensor<string, []>("prev_kv_5_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -613,183 +639,183 @@ program(1.0)
             tensor<bool, [2]> prev_scale_5_end_mask_0 = const()[name = tensor<string, []>("prev_scale_5_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_5_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_5_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_5 = slice_by_index(begin = prev_scale_5_begin_0, end = prev_scale_5_end_0, end_mask = prev_scale_5_end_mask_0, squeeze_mask = prev_scale_5_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_5")];
-            tensor<fp32, [1, 5, 256]> var_607 = linear(bias = encoder_q_proj_2_bias, weight = encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
-            tensor<int32, [4]> var_608 = const()[name = tensor<string, []>("op_608"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_609 = reshape(shape = var_608, x = var_607)[name = tensor<string, []>("op_609")];
+            tensor<fp32, [1, 5, 256]> var_673 = linear(bias = inner_encoder_q_proj_2_bias, weight = inner_encoder_q_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_21")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_675 = reshape(shape = var_674, x = var_673)[name = tensor<string, []>("op_675")];
             tensor<int32, [4]> q_5_perm_0 = const()[name = tensor<string, []>("q_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_613 = linear(bias = encoder_k_proj_2_bias, weight = encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
-            tensor<fp32, []> var_614 = const()[name = tensor<string, []>("op_614"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_615 = mul(x = var_613, y = var_614)[name = tensor<string, []>("op_615")];
-            tensor<int32, [4]> var_616 = const()[name = tensor<string, []>("op_616"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_617 = reshape(shape = var_616, x = var_615)[name = tensor<string, []>("op_617")];
+            tensor<fp32, [1, 5, 256]> var_679 = linear(bias = inner_encoder_k_proj_2_bias, weight = inner_encoder_k_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_22")];
+            tensor<fp32, []> var_680 = const()[name = tensor<string, []>("op_680"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_681 = mul(x = var_679, y = var_680)[name = tensor<string, []>("op_681")];
+            tensor<int32, [4]> var_682 = const()[name = tensor<string, []>("op_682"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_683 = reshape(shape = var_682, x = var_681)[name = tensor<string, []>("op_683")];
             tensor<int32, [4]> k_5_perm_0 = const()[name = tensor<string, []>("k_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_621 = linear(bias = encoder_v_proj_2_bias, weight = encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
-            tensor<int32, [4]> var_622 = const()[name = tensor<string, []>("op_622"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_623 = reshape(shape = var_622, x = var_621)[name = tensor<string, []>("op_623")];
+            tensor<fp32, [1, 5, 256]> var_687 = linear(bias = inner_encoder_v_proj_2_bias, weight = inner_encoder_v_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_23")];
+            tensor<int32, [4]> var_688 = const()[name = tensor<string, []>("op_688"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_689 = reshape(shape = var_688, x = var_687)[name = tensor<string, []>("op_689")];
             tensor<int32, [4]> v_5_perm_0 = const()[name = tensor<string, []>("v_5_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_97 = linear(bias = encoder_g_proj_2_bias, weight = encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
+            tensor<fp32, [1, 5, 256]> input_99 = linear(bias = inner_encoder_g_proj_2_bias, weight = inner_encoder_g_proj_2_weight, x = x_13)[name = tensor<string, []>("linear_24")];
             tensor<fp32, [1]> sqrt_s0_5 = sqrt(x = prev_scale_5)[name = tensor<string, []>("sqrt_s0_5")];
-            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = encoder__t_index)[name = tensor<string, []>("s_t_5")];
+            tensor<fp32, [5]> s_t_5 = add(x = prev_scale_5, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_5")];
             tensor<fp32, [5]> sqrt_s_t_5 = sqrt(x = s_t_5)[name = tensor<string, []>("sqrt_s_t_5")];
             tensor<bool, []> qk_5_transpose_x_1 = const()[name = tensor<string, []>("qk_5_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_5_transpose_y_1 = const()[name = tensor<string, []>("qk_5_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_617)[name = tensor<string, []>("transpose_43")];
-            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_609)[name = tensor<string, []>("transpose_44")];
+            tensor<fp32, [1, 4, 5, 64]> k_5 = transpose(perm = k_5_perm_0, x = var_683)[name = tensor<string, []>("transpose_43")];
+            tensor<fp32, [1, 4, 5, 64]> q_5 = transpose(perm = q_5_perm_0, x = var_675)[name = tensor<string, []>("transpose_44")];
             tensor<fp32, [1, 4, 5, 5]> qk_5 = matmul(transpose_x = qk_5_transpose_x_1, transpose_y = qk_5_transpose_y_1, x = q_5, y = k_5)[name = tensor<string, []>("qk_5")];
-            tensor<int32, [2]> var_633 = const()[name = tensor<string, []>("op_633"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_634 = reshape(shape = var_633, x = sqrt_s_t_5)[name = tensor<string, []>("op_634")];
-            tensor<fp32, [5, 5]> M_5 = real_div(x = encoder__causal_mask, y = var_634)[name = tensor<string, []>("M_5")];
-            tensor<fp32, [1, 4, 5, 5]> var_636 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_636")];
+            tensor<int32, [2]> var_699 = const()[name = tensor<string, []>("op_699"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_700 = reshape(shape = var_699, x = sqrt_s_t_5)[name = tensor<string, []>("op_700")];
+            tensor<fp32, [5, 5]> M_5 = real_div(x = inner_encoder__causal_mask, y = var_700)[name = tensor<string, []>("M_5")];
+            tensor<fp32, [1, 4, 5, 5]> var_702 = mul(x = qk_5, y = M_5)[name = tensor<string, []>("op_702")];
             tensor<bool, []> inner_5_transpose_x_0 = const()[name = tensor<string, []>("inner_5_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_5_transpose_y_0 = const()[name = tensor<string, []>("inner_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_623)[name = tensor<string, []>("transpose_42")];
-            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_636, y = v_5)[name = tensor<string, []>("inner_5")];
-            tensor<bool, []> var_638_transpose_x_0 = const()[name = tensor<string, []>("op_638_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_638_transpose_y_0 = const()[name = tensor<string, []>("op_638_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_638 = matmul(transpose_x = var_638_transpose_x_0, transpose_y = var_638_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_638")];
-            tensor<fp32, [5]> var_639 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_639")];
-            tensor<int32, [4]> var_640 = const()[name = tensor<string, []>("op_640"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_641 = reshape(shape = var_640, x = var_639)[name = tensor<string, []>("op_641")];
-            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_638, y = var_641)[name = tensor<string, []>("cross_5")];
+            tensor<fp32, [1, 4, 5, 64]> v_5 = transpose(perm = v_5_perm_0, x = var_689)[name = tensor<string, []>("transpose_42")];
+            tensor<fp32, [1, 4, 5, 64]> inner_5 = matmul(transpose_x = inner_5_transpose_x_0, transpose_y = inner_5_transpose_y_0, x = var_702, y = v_5)[name = tensor<string, []>("inner_5")];
+            tensor<bool, []> var_704_transpose_x_0 = const()[name = tensor<string, []>("op_704_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_704_transpose_y_0 = const()[name = tensor<string, []>("op_704_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_704 = matmul(transpose_x = var_704_transpose_x_0, transpose_y = var_704_transpose_y_0, x = q_5, y = prev_kv_5)[name = tensor<string, []>("op_704")];
+            tensor<fp32, [5]> var_705 = real_div(x = sqrt_s0_5, y = sqrt_s_t_5)[name = tensor<string, []>("op_705")];
+            tensor<int32, [4]> var_706 = const()[name = tensor<string, []>("op_706"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_707 = reshape(shape = var_706, x = var_705)[name = tensor<string, []>("op_707")];
+            tensor<fp32, [1, 4, 5, 64]> cross_5 = mul(x = var_704, y = var_707)[name = tensor<string, []>("cross_5")];
             tensor<fp32, [1, 4, 5, 64]> out_13 = add(x = inner_5, y = cross_5)[name = tensor<string, []>("out_13")];
-            tensor<fp32, [1, 4, 64, 64]> var_644 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_644")];
-            tensor<bool, []> var_646_transpose_x_1 = const()[name = tensor<string, []>("op_646_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_646_transpose_y_1 = const()[name = tensor<string, []>("op_646_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_646 = matmul(transpose_x = var_646_transpose_x_1, transpose_y = var_646_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_646")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_644, y = var_646)[name = tensor<string, []>("new_kv_unnorm_5")];
-            tensor<fp32, []> var_648 = const()[name = tensor<string, []>("op_648"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_648)[name = tensor<string, []>("new_scale_5")];
-            tensor<fp32, [1]> var_650 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_650")];
-            tensor<fp32, [1, 4, 64, 64]> var_651 = real_div(x = new_kv_unnorm_5, y = var_650)[name = tensor<string, []>("op_651")];
-            tensor<int32, [4]> var_652_perm_0 = const()[name = tensor<string, []>("op_652_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_710 = mul(x = prev_kv_5, y = sqrt_s0_5)[name = tensor<string, []>("op_710")];
+            tensor<bool, []> var_712_transpose_x_1 = const()[name = tensor<string, []>("op_712_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_712_transpose_y_1 = const()[name = tensor<string, []>("op_712_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_712 = matmul(transpose_x = var_712_transpose_x_1, transpose_y = var_712_transpose_y_1, x = k_5, y = v_5)[name = tensor<string, []>("op_712")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_5 = add(x = var_710, y = var_712)[name = tensor<string, []>("new_kv_unnorm_5")];
+            tensor<fp32, []> var_714 = const()[name = tensor<string, []>("op_714"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_5 = add(x = prev_scale_5, y = var_714)[name = tensor<string, []>("new_scale_5")];
+            tensor<fp32, [1]> var_716 = sqrt(x = new_scale_5)[name = tensor<string, []>("op_716")];
+            tensor<fp32, [1, 4, 64, 64]> var_717 = real_div(x = new_kv_unnorm_5, y = var_716)[name = tensor<string, []>("op_717")];
+            tensor<int32, [4]> var_718_perm_0 = const()[name = tensor<string, []>("op_718_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_652 = transpose(perm = var_652_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
-            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_18, x = var_652)[name = tensor<string, []>("out_15")];
-            tensor<int32, [3]> var_656 = const()[name = tensor<string, []>("op_656"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_656, x = out_15)[name = tensor<string, []>("out_17")];
-            tensor<fp32, [1, 5, 256]> var_658 = silu(x = input_97)[name = tensor<string, []>("op_658")];
-            tensor<fp32, [1, 5, 256]> input_99 = mul(x = var_658, y = out_17)[name = tensor<string, []>("input_99")];
-            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = encoder_out_proj_2_bias, weight = encoder_out_proj_2_weight, x = input_99)[name = tensor<string, []>("linear_25")];
-            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_93, y = ret_out_5)[name = tensor<string, []>("x_15")];
+            tensor<fp32, [1, 5, 4, 64]> var_718 = transpose(perm = var_718_perm_0, x = out_13)[name = tensor<string, []>("transpose_41")];
+            tensor<fp32, [1, 5, 4, 64]> out_15 = layer_norm(axes = out_15_axes_0, epsilon = var_84, x = var_718)[name = tensor<string, []>("out_15")];
+            tensor<int32, [3]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_17 = reshape(shape = var_722, x = out_15)[name = tensor<string, []>("out_17")];
+            tensor<fp32, [1, 5, 256]> var_724 = silu(x = input_99)[name = tensor<string, []>("op_724")];
+            tensor<fp32, [1, 5, 256]> input_101 = mul(x = var_724, y = out_17)[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 5, 256]> ret_out_5 = linear(bias = inner_encoder_out_proj_2_bias, weight = inner_encoder_out_proj_2_weight, x = input_101)[name = tensor<string, []>("linear_25")];
+            tensor<fp32, [1, 5, 256]> x_15 = add(x = input_95, y = ret_out_5)[name = tensor<string, []>("x_15")];
             tensor<int32, [4]> window_25_begin_0 = const()[name = tensor<string, []>("window_25_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> window_25_end_0 = const()[name = tensor<string, []>("window_25_end_0"), val = tensor<int32, [4]>([3, 1, 16, 256])];
             tensor<bool, [4]> window_25_end_mask_0 = const()[name = tensor<string, []>("window_25_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_25_squeeze_mask_0 = const()[name = tensor<string, []>("window_25_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_25 = slice_by_index(begin = window_25_begin_0, end = window_25_end_0, end_mask = window_25_end_mask_0, squeeze_mask = window_25_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_25")];
-            tensor<int32, [3]> var_666_begin_0 = const()[name = tensor<string, []>("op_666_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_666_end_0 = const()[name = tensor<string, []>("op_666_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_666_end_mask_0 = const()[name = tensor<string, []>("op_666_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_666 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = x_15)[name = tensor<string, []>("op_666")];
-            tensor<int32, [3]> var_669_begin_0 = const()[name = tensor<string, []>("op_669_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_669_end_0 = const()[name = tensor<string, []>("op_669_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_669_end_mask_0 = const()[name = tensor<string, []>("op_669_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_669 = slice_by_index(begin = var_669_begin_0, end = var_669_end_0, end_mask = var_669_end_mask_0, x = window_25)[name = tensor<string, []>("op_669")];
+            tensor<int32, [3]> var_732_begin_0 = const()[name = tensor<string, []>("op_732_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_732_end_0 = const()[name = tensor<string, []>("op_732_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_732_end_mask_0 = const()[name = tensor<string, []>("op_732_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_732 = slice_by_index(begin = var_732_begin_0, end = var_732_end_0, end_mask = var_732_end_mask_0, x = x_15)[name = tensor<string, []>("op_732")];
+            tensor<int32, [3]> var_735_begin_0 = const()[name = tensor<string, []>("op_735_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_735_end_0 = const()[name = tensor<string, []>("op_735_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_735_end_mask_0 = const()[name = tensor<string, []>("op_735_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_735 = slice_by_index(begin = var_735_begin_0, end = var_735_end_0, end_mask = var_735_end_mask_0, x = window_25)[name = tensor<string, []>("op_735")];
             tensor<bool, []> window_27_interleave_0 = const()[name = tensor<string, []>("window_27_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_27, interleave = window_27_interleave_0, values = (var_669, var_666))[name = tensor<string, []>("window_27")];
-            tensor<int32, [3]> var_674_begin_0 = const()[name = tensor<string, []>("op_674_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_674_end_0 = const()[name = tensor<string, []>("op_674_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_674_end_mask_0 = const()[name = tensor<string, []>("op_674_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_674 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = x_15)[name = tensor<string, []>("op_674")];
-            tensor<int32, [3]> var_677_begin_0 = const()[name = tensor<string, []>("op_677_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_677_end_0 = const()[name = tensor<string, []>("op_677_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_677_end_mask_0 = const()[name = tensor<string, []>("op_677_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_677 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = window_27)[name = tensor<string, []>("op_677")];
+            tensor<fp32, [1, 16, 256]> window_27 = concat(axis = var_93, interleave = window_27_interleave_0, values = (var_735, var_732))[name = tensor<string, []>("window_27")];
+            tensor<int32, [3]> var_740_begin_0 = const()[name = tensor<string, []>("op_740_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_740_end_0 = const()[name = tensor<string, []>("op_740_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_740_end_mask_0 = const()[name = tensor<string, []>("op_740_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_740 = slice_by_index(begin = var_740_begin_0, end = var_740_end_0, end_mask = var_740_end_mask_0, x = x_15)[name = tensor<string, []>("op_740")];
+            tensor<int32, [3]> var_743_begin_0 = const()[name = tensor<string, []>("op_743_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_743_end_0 = const()[name = tensor<string, []>("op_743_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_743_end_mask_0 = const()[name = tensor<string, []>("op_743_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_743 = slice_by_index(begin = var_743_begin_0, end = var_743_end_0, end_mask = var_743_end_mask_0, x = window_27)[name = tensor<string, []>("op_743")];
             tensor<bool, []> window_29_interleave_0 = const()[name = tensor<string, []>("window_29_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_27, interleave = window_29_interleave_0, values = (var_677, var_674))[name = tensor<string, []>("window_29")];
-            tensor<int32, [3]> var_682_begin_0 = const()[name = tensor<string, []>("op_682_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_682_end_0 = const()[name = tensor<string, []>("op_682_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_682_end_mask_0 = const()[name = tensor<string, []>("op_682_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_682 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = x_15)[name = tensor<string, []>("op_682")];
-            tensor<int32, [3]> var_685_begin_0 = const()[name = tensor<string, []>("op_685_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_685_end_0 = const()[name = tensor<string, []>("op_685_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_685_end_mask_0 = const()[name = tensor<string, []>("op_685_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_685 = slice_by_index(begin = var_685_begin_0, end = var_685_end_0, end_mask = var_685_end_mask_0, x = window_29)[name = tensor<string, []>("op_685")];
+            tensor<fp32, [1, 16, 256]> window_29 = concat(axis = var_93, interleave = window_29_interleave_0, values = (var_743, var_740))[name = tensor<string, []>("window_29")];
+            tensor<int32, [3]> var_748_begin_0 = const()[name = tensor<string, []>("op_748_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_748_end_0 = const()[name = tensor<string, []>("op_748_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_748_end_mask_0 = const()[name = tensor<string, []>("op_748_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_748 = slice_by_index(begin = var_748_begin_0, end = var_748_end_0, end_mask = var_748_end_mask_0, x = x_15)[name = tensor<string, []>("op_748")];
+            tensor<int32, [3]> var_751_begin_0 = const()[name = tensor<string, []>("op_751_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_751_end_0 = const()[name = tensor<string, []>("op_751_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_751_end_mask_0 = const()[name = tensor<string, []>("op_751_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_751 = slice_by_index(begin = var_751_begin_0, end = var_751_end_0, end_mask = var_751_end_mask_0, x = window_29)[name = tensor<string, []>("op_751")];
             tensor<bool, []> window_31_interleave_0 = const()[name = tensor<string, []>("window_31_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_27, interleave = window_31_interleave_0, values = (var_685, var_682))[name = tensor<string, []>("window_31")];
-            tensor<int32, [3]> var_690_begin_0 = const()[name = tensor<string, []>("op_690_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_690_end_0 = const()[name = tensor<string, []>("op_690_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_690_end_mask_0 = const()[name = tensor<string, []>("op_690_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_690 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = x_15)[name = tensor<string, []>("op_690")];
-            tensor<int32, [3]> var_693_begin_0 = const()[name = tensor<string, []>("op_693_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_693_end_0 = const()[name = tensor<string, []>("op_693_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_693_end_mask_0 = const()[name = tensor<string, []>("op_693_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_693 = slice_by_index(begin = var_693_begin_0, end = var_693_end_0, end_mask = var_693_end_mask_0, x = window_31)[name = tensor<string, []>("op_693")];
+            tensor<fp32, [1, 16, 256]> window_31 = concat(axis = var_93, interleave = window_31_interleave_0, values = (var_751, var_748))[name = tensor<string, []>("window_31")];
+            tensor<int32, [3]> var_756_begin_0 = const()[name = tensor<string, []>("op_756_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_756_end_0 = const()[name = tensor<string, []>("op_756_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_756_end_mask_0 = const()[name = tensor<string, []>("op_756_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_756 = slice_by_index(begin = var_756_begin_0, end = var_756_end_0, end_mask = var_756_end_mask_0, x = x_15)[name = tensor<string, []>("op_756")];
+            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = window_31)[name = tensor<string, []>("op_759")];
             tensor<bool, []> window_33_interleave_0 = const()[name = tensor<string, []>("window_33_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_27, interleave = window_33_interleave_0, values = (var_693, var_690))[name = tensor<string, []>("window_33")];
-            tensor<int32, [3]> var_698_begin_0 = const()[name = tensor<string, []>("op_698_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_698_end_0 = const()[name = tensor<string, []>("op_698_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_698_end_mask_0 = const()[name = tensor<string, []>("op_698_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_698 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = x_15)[name = tensor<string, []>("op_698")];
-            tensor<int32, [3]> var_701_begin_0 = const()[name = tensor<string, []>("op_701_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_701_end_0 = const()[name = tensor<string, []>("op_701_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_701_end_mask_0 = const()[name = tensor<string, []>("op_701_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_701 = slice_by_index(begin = var_701_begin_0, end = var_701_end_0, end_mask = var_701_end_mask_0, x = window_33)[name = tensor<string, []>("op_701")];
+            tensor<fp32, [1, 16, 256]> window_33 = concat(axis = var_93, interleave = window_33_interleave_0, values = (var_759, var_756))[name = tensor<string, []>("window_33")];
+            tensor<int32, [3]> var_764_begin_0 = const()[name = tensor<string, []>("op_764_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_764_end_0 = const()[name = tensor<string, []>("op_764_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_764_end_mask_0 = const()[name = tensor<string, []>("op_764_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_764 = slice_by_index(begin = var_764_begin_0, end = var_764_end_0, end_mask = var_764_end_mask_0, x = x_15)[name = tensor<string, []>("op_764")];
+            tensor<int32, [3]> var_767_begin_0 = const()[name = tensor<string, []>("op_767_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_767_end_0 = const()[name = tensor<string, []>("op_767_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_767_end_mask_0 = const()[name = tensor<string, []>("op_767_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_767 = slice_by_index(begin = var_767_begin_0, end = var_767_end_0, end_mask = var_767_end_mask_0, x = window_33)[name = tensor<string, []>("op_767")];
             tensor<bool, []> window_35_interleave_0 = const()[name = tensor<string, []>("window_35_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_27, interleave = window_35_interleave_0, values = (var_701, var_698))[name = tensor<string, []>("window_35")];
-            tensor<bool, []> input_101_interleave_0 = const()[name = tensor<string, []>("input_101_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_101 = concat(axis = var_24, interleave = input_101_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_101")];
+            tensor<fp32, [1, 16, 256]> window_35 = concat(axis = var_93, interleave = window_35_interleave_0, values = (var_767, var_764))[name = tensor<string, []>("window_35")];
+            tensor<bool, []> input_103_interleave_0 = const()[name = tensor<string, []>("input_103_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_103 = concat(axis = var_79, interleave = input_103_interleave_0, values = (window_27, window_29, window_31, window_33, window_35))[name = tensor<string, []>("input_103")];
             tensor<int32, [1]> x_17_axes_0 = const()[name = tensor<string, []>("x_17_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = encoder_conv_module_2_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_0_weight, x = input_101)[name = tensor<string, []>("x_17")];
-            tensor<int32, [3]> input_103_perm_0 = const()[name = tensor<string, []>("input_103_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_17 = layer_norm(axes = x_17_axes_0, beta = inner_encoder_conv_module_2_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_0_weight, x = input_103)[name = tensor<string, []>("x_17")];
+            tensor<int32, [3]> input_105_perm_0 = const()[name = tensor<string, []>("input_105_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_23_pad_type_0 = const()[name = tensor<string, []>("inputs_23_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_23_strides_0 = const()[name = tensor<string, []>("inputs_23_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_23_pad_0 = const()[name = tensor<string, []>("inputs_23_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_23_dilations_0 = const()[name = tensor<string, []>("inputs_23_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_23_groups_0 = const()[name = tensor<string, []>("inputs_23_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_103 = transpose(perm = input_103_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
-            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = encoder_conv_module_2_sequential_2_conv_weight, x = input_103)[name = tensor<string, []>("inputs_23")];
-            tensor<int32, [2]> var_726_split_sizes_0 = const()[name = tensor<string, []>("op_726_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_726_axis_0 = const()[name = tensor<string, []>("op_726_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_726_0, tensor<fp32, [5, 256, 16]> var_726_1 = split(axis = var_726_axis_0, split_sizes = var_726_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_726")];
-            tensor<fp32, [5, 256, 16]> var_728 = sigmoid(x = var_726_1)[name = tensor<string, []>("op_728")];
-            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_726_0, y = var_728)[name = tensor<string, []>("inputs_25")];
+            tensor<fp32, [5, 256, 16]> input_105 = transpose(perm = input_105_perm_0, x = x_17)[name = tensor<string, []>("transpose_40")];
+            tensor<fp32, [5, 512, 16]> inputs_23 = conv(bias = inner_encoder_conv_module_2_sequential_2_conv_bias, dilations = inputs_23_dilations_0, groups = inputs_23_groups_0, pad = inputs_23_pad_0, pad_type = inputs_23_pad_type_0, strides = inputs_23_strides_0, weight = inner_encoder_conv_module_2_sequential_2_conv_weight, x = input_105)[name = tensor<string, []>("inputs_23")];
+            tensor<int32, [2]> var_792_split_sizes_0 = const()[name = tensor<string, []>("op_792_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_792_axis_0 = const()[name = tensor<string, []>("op_792_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_792_0, tensor<fp32, [5, 256, 16]> var_792_1 = split(axis = var_792_axis_0, split_sizes = var_792_split_sizes_0, x = inputs_23)[name = tensor<string, []>("op_792")];
+            tensor<fp32, [5, 256, 16]> var_794 = sigmoid(x = var_792_1)[name = tensor<string, []>("op_794")];
+            tensor<fp32, [5, 256, 16]> inputs_25 = mul(x = var_792_0, y = var_794)[name = tensor<string, []>("inputs_25")];
             tensor<string, []> outputs_aug_5_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_5_pad_0 = const()[name = tensor<string, []>("outputs_aug_5_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_5_groups_0 = const()[name = tensor<string, []>("outputs_aug_5_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_5_strides_0 = const()[name = tensor<string, []>("outputs_aug_5_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_5_dilations_0 = const()[name = tensor<string, []>("outputs_aug_5_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
-            tensor<int32, [3]> input_105_begin_0 = const()[name = tensor<string, []>("input_105_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_105_end_0 = const()[name = tensor<string, []>("input_105_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_105_end_mask_0 = const()[name = tensor<string, []>("input_105_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_105 = slice_by_index(begin = input_105_begin_0, end = input_105_end_0, end_mask = input_105_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_105")];
-            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = encoder_conv_module_2_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_2_sequential_5_weight, mean = encoder_conv_module_2_sequential_5_running_mean, variance = encoder_conv_module_2_sequential_5_running_var, x = input_105)[name = tensor<string, []>("inputs_27")];
-            tensor<fp32, [5, 256, 16]> input_107 = silu(x = inputs_27)[name = tensor<string, []>("input_107")];
-            tensor<string, []> input_109_pad_type_0 = const()[name = tensor<string, []>("input_109_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_109_pad_0 = const()[name = tensor<string, []>("input_109_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_109 = conv(bias = encoder_conv_module_2_sequential_7_conv_bias, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = encoder_conv_module_2_sequential_7_conv_weight, x = input_107)[name = tensor<string, []>("input_109")];
+            tensor<fp32, [5, 256, 31]> outputs_aug_5 = conv(dilations = outputs_aug_5_dilations_0, groups = outputs_aug_5_groups_0, pad = outputs_aug_5_pad_0, pad_type = outputs_aug_5_pad_type_0, strides = outputs_aug_5_strides_0, weight = inner_encoder_conv_module_2_sequential_4_conv_weight, x = inputs_25)[name = tensor<string, []>("outputs_aug_5")];
+            tensor<int32, [3]> input_107_begin_0 = const()[name = tensor<string, []>("input_107_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_107_end_0 = const()[name = tensor<string, []>("input_107_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_107_end_mask_0 = const()[name = tensor<string, []>("input_107_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_107 = slice_by_index(begin = input_107_begin_0, end = input_107_end_0, end_mask = input_107_end_mask_0, x = outputs_aug_5)[name = tensor<string, []>("input_107")];
+            tensor<fp32, [5, 256, 16]> inputs_27 = batch_norm(beta = inner_encoder_conv_module_2_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_2_sequential_5_weight, mean = inner_encoder_conv_module_2_sequential_5_running_mean, variance = inner_encoder_conv_module_2_sequential_5_running_var, x = input_107)[name = tensor<string, []>("inputs_27")];
+            tensor<fp32, [5, 256, 16]> input_109 = silu(x = inputs_27)[name = tensor<string, []>("input_109")];
+            tensor<string, []> input_111_pad_type_0 = const()[name = tensor<string, []>("input_111_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_111_strides_0 = const()[name = tensor<string, []>("input_111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_111_pad_0 = const()[name = tensor<string, []>("input_111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_111_dilations_0 = const()[name = tensor<string, []>("input_111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_111_groups_0 = const()[name = tensor<string, []>("input_111_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_111 = conv(bias = inner_encoder_conv_module_2_sequential_7_conv_bias, dilations = input_111_dilations_0, groups = input_111_groups_0, pad = input_111_pad_0, pad_type = input_111_pad_type_0, strides = input_111_strides_0, weight = inner_encoder_conv_module_2_sequential_7_conv_weight, x = input_109)[name = tensor<string, []>("input_111")];
             tensor<int32, [3]> conv_out_5_perm_0 = const()[name = tensor<string, []>("conv_out_5_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_759_begin_0 = const()[name = tensor<string, []>("op_759_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_759_end_0 = const()[name = tensor<string, []>("op_759_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_759_end_mask_0 = const()[name = tensor<string, []>("op_759_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_109)[name = tensor<string, []>("transpose_39")];
-            tensor<fp32, [5, 1, 256]> var_759 = slice_by_index(begin = var_759_begin_0, end = var_759_end_0, end_mask = var_759_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_759")];
-            tensor<int32, [3]> var_761_perm_0 = const()[name = tensor<string, []>("op_761_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_761 = transpose(perm = var_761_perm_0, x = var_759)[name = tensor<string, []>("transpose_38")];
-            tensor<fp32, [1, 5, 256]> input_111 = add(x = x_15, y = var_761)[name = tensor<string, []>("input_111")];
-            tensor<int32, [1]> input_113_axes_0 = const()[name = tensor<string, []>("input_113_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_113 = layer_norm(axes = input_113_axes_0, beta = encoder_ffn2_2_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_2_module_sequential_0_weight, x = input_111)[name = tensor<string, []>("input_113")];
-            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = encoder_ffn2_2_module_sequential_1_linear_bias, weight = encoder_ffn2_2_module_sequential_1_linear_weight, x = input_113)[name = tensor<string, []>("linear_26")];
-            tensor<fp32, [1, 5, 1024]> input_115 = silu(x = inputs_29)[name = tensor<string, []>("input_115")];
-            tensor<fp32, [1, 5, 256]> input_119 = linear(bias = encoder_ffn2_2_module_sequential_4_linear_bias, weight = encoder_ffn2_2_module_sequential_4_linear_weight, x = input_115)[name = tensor<string, []>("linear_27")];
-            tensor<fp32, []> var_784 = const()[name = tensor<string, []>("op_784"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_785 = mul(x = input_119, y = var_784)[name = tensor<string, []>("op_785")];
-            tensor<fp32, [1, 5, 256]> input_121 = add(x = var_785, y = input_111)[name = tensor<string, []>("input_121")];
-            tensor<int32, [1]> input_123_axes_0 = const()[name = tensor<string, []>("input_123_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_123 = layer_norm(axes = input_123_axes_0, beta = encoder_layer_norm_2_bias, epsilon = var_29, gamma = encoder_layer_norm_2_weight, x = input_121)[name = tensor<string, []>("input_123")];
+            tensor<int32, [3]> var_825_begin_0 = const()[name = tensor<string, []>("op_825_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_825_end_0 = const()[name = tensor<string, []>("op_825_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_825_end_mask_0 = const()[name = tensor<string, []>("op_825_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_5 = transpose(perm = conv_out_5_perm_0, x = input_111)[name = tensor<string, []>("transpose_39")];
+            tensor<fp32, [5, 1, 256]> var_825 = slice_by_index(begin = var_825_begin_0, end = var_825_end_0, end_mask = var_825_end_mask_0, x = conv_out_5)[name = tensor<string, []>("op_825")];
+            tensor<int32, [3]> var_827_perm_0 = const()[name = tensor<string, []>("op_827_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_827 = transpose(perm = var_827_perm_0, x = var_825)[name = tensor<string, []>("transpose_38")];
+            tensor<fp32, [1, 5, 256]> input_113 = add(x = x_15, y = var_827)[name = tensor<string, []>("input_113")];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = tensor<string, []>("input_115_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_115 = layer_norm(axes = input_115_axes_0, beta = inner_encoder_ffn2_2_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_2_module_sequential_0_weight, x = input_113)[name = tensor<string, []>("input_115")];
+            tensor<fp32, [1, 5, 1024]> inputs_29 = linear(bias = inner_encoder_ffn2_2_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_1_linear_weight, x = input_115)[name = tensor<string, []>("linear_26")];
+            tensor<fp32, [1, 5, 1024]> input_117 = silu(x = inputs_29)[name = tensor<string, []>("input_117")];
+            tensor<fp32, [1, 5, 256]> input_121 = linear(bias = inner_encoder_ffn2_2_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_2_module_sequential_4_linear_weight, x = input_117)[name = tensor<string, []>("linear_27")];
+            tensor<fp32, []> var_850 = const()[name = tensor<string, []>("op_850"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_851 = mul(x = input_121, y = var_850)[name = tensor<string, []>("op_851")];
+            tensor<fp32, [1, 5, 256]> input_123 = add(x = var_851, y = input_113)[name = tensor<string, []>("input_123")];
             tensor<int32, [1]> input_125_axes_0 = const()[name = tensor<string, []>("input_125_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = encoder_ffn1_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn1_3_module_sequential_0_weight, x = input_123)[name = tensor<string, []>("input_125")];
-            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = encoder_ffn1_3_module_sequential_1_linear_bias, weight = encoder_ffn1_3_module_sequential_1_linear_weight, x = input_125)[name = tensor<string, []>("linear_28")];
-            tensor<fp32, [1, 5, 1024]> input_127 = silu(x = inputs_31)[name = tensor<string, []>("input_127")];
-            tensor<fp32, [1, 5, 256]> input_131 = linear(bias = encoder_ffn1_3_module_sequential_4_linear_bias, weight = encoder_ffn1_3_module_sequential_4_linear_weight, x = input_127)[name = tensor<string, []>("linear_29")];
-            tensor<fp32, []> var_814 = const()[name = tensor<string, []>("op_814"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_815 = mul(x = input_131, y = var_814)[name = tensor<string, []>("op_815")];
-            tensor<fp32, [1, 5, 256]> input_133 = add(x = var_815, y = input_123)[name = tensor<string, []>("input_133")];
+            tensor<fp32, [1, 5, 256]> input_125 = layer_norm(axes = input_125_axes_0, beta = inner_encoder_layer_norm_2_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_2_weight, x = input_123)[name = tensor<string, []>("input_125")];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = tensor<string, []>("input_127_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_127 = layer_norm(axes = input_127_axes_0, beta = inner_encoder_ffn1_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn1_3_module_sequential_0_weight, x = input_125)[name = tensor<string, []>("input_127")];
+            tensor<fp32, [1, 5, 1024]> inputs_31 = linear(bias = inner_encoder_ffn1_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_1_linear_weight, x = input_127)[name = tensor<string, []>("linear_28")];
+            tensor<fp32, [1, 5, 1024]> input_129 = silu(x = inputs_31)[name = tensor<string, []>("input_129")];
+            tensor<fp32, [1, 5, 256]> input_133 = linear(bias = inner_encoder_ffn1_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn1_3_module_sequential_4_linear_weight, x = input_129)[name = tensor<string, []>("linear_29")];
+            tensor<fp32, []> var_880 = const()[name = tensor<string, []>("op_880"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_881 = mul(x = input_133, y = var_880)[name = tensor<string, []>("op_881")];
+            tensor<fp32, [1, 5, 256]> input_135 = add(x = var_881, y = input_125)[name = tensor<string, []>("input_135")];
             tensor<int32, [1]> x_19_axes_0 = const()[name = tensor<string, []>("x_19_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = encoder_ret_lns_3_bias, epsilon = var_29, gamma = encoder_ret_lns_3_weight, x = input_133)[name = tensor<string, []>("x_19")];
+            tensor<fp32, [1, 5, 256]> x_19 = layer_norm(axes = x_19_axes_0, beta = inner_encoder_ret_lns_3_bias, epsilon = var_76, gamma = inner_encoder_ret_lns_3_weight, x = input_135)[name = tensor<string, []>("x_19")];
             tensor<int32, [5]> prev_kv_7_begin_0 = const()[name = tensor<string, []>("prev_kv_7_begin_0"), val = tensor<int32, [5]>([3, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_7_end_0 = const()[name = tensor<string, []>("prev_kv_7_end_0"), val = tensor<int32, [5]>([4, 1, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_7_end_mask_0 = const()[name = tensor<string, []>("prev_kv_7_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -800,219 +826,212 @@ program(1.0)
             tensor<bool, [2]> prev_scale_7_end_mask_0 = const()[name = tensor<string, []>("prev_scale_7_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_7_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_7_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_7 = slice_by_index(begin = prev_scale_7_begin_0, end = prev_scale_7_end_0, end_mask = prev_scale_7_end_mask_0, squeeze_mask = prev_scale_7_squeeze_mask_0, x = enc_scale)[name = tensor<string, []>("prev_scale_7")];
-            tensor<fp32, [1, 5, 256]> var_829 = linear(bias = encoder_q_proj_3_bias, weight = encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
-            tensor<int32, [4]> var_830 = const()[name = tensor<string, []>("op_830"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_831 = reshape(shape = var_830, x = var_829)[name = tensor<string, []>("op_831")];
+            tensor<fp32, [1, 5, 256]> var_895 = linear(bias = inner_encoder_q_proj_3_bias, weight = inner_encoder_q_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_30")];
+            tensor<int32, [4]> var_896 = const()[name = tensor<string, []>("op_896"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_897 = reshape(shape = var_896, x = var_895)[name = tensor<string, []>("op_897")];
             tensor<int32, [4]> q_7_perm_0 = const()[name = tensor<string, []>("q_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_835 = linear(bias = encoder_k_proj_3_bias, weight = encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
-            tensor<fp32, []> var_836 = const()[name = tensor<string, []>("op_836"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [1, 5, 256]> var_837 = mul(x = var_835, y = var_836)[name = tensor<string, []>("op_837")];
-            tensor<int32, [4]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_839 = reshape(shape = var_838, x = var_837)[name = tensor<string, []>("op_839")];
+            tensor<fp32, [1, 5, 256]> var_901 = linear(bias = inner_encoder_k_proj_3_bias, weight = inner_encoder_k_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_31")];
+            tensor<fp32, []> var_902 = const()[name = tensor<string, []>("op_902"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [1, 5, 256]> var_903 = mul(x = var_901, y = var_902)[name = tensor<string, []>("op_903")];
+            tensor<int32, [4]> var_904 = const()[name = tensor<string, []>("op_904"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_905 = reshape(shape = var_904, x = var_903)[name = tensor<string, []>("op_905")];
             tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> var_843 = linear(bias = encoder_v_proj_3_bias, weight = encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
-            tensor<int32, [4]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [4]>([1, 5, 4, 64])];
-            tensor<fp32, [1, 5, 4, 64]> var_845 = reshape(shape = var_844, x = var_843)[name = tensor<string, []>("op_845")];
+            tensor<fp32, [1, 5, 256]> var_909 = linear(bias = inner_encoder_v_proj_3_bias, weight = inner_encoder_v_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_32")];
+            tensor<int32, [4]> var_910 = const()[name = tensor<string, []>("op_910"), val = tensor<int32, [4]>([1, 5, 4, 64])];
+            tensor<fp32, [1, 5, 4, 64]> var_911 = reshape(shape = var_910, x = var_909)[name = tensor<string, []>("op_911")];
             tensor<int32, [4]> v_7_perm_0 = const()[name = tensor<string, []>("v_7_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [1, 5, 256]> input_137 = linear(bias = encoder_g_proj_3_bias, weight = encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
+            tensor<fp32, [1, 5, 256]> input_139 = linear(bias = inner_encoder_g_proj_3_bias, weight = inner_encoder_g_proj_3_weight, x = x_19)[name = tensor<string, []>("linear_33")];
             tensor<fp32, [1]> sqrt_s0_7 = sqrt(x = prev_scale_7)[name = tensor<string, []>("sqrt_s0_7")];
-            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = encoder__t_index)[name = tensor<string, []>("s_t_7")];
+            tensor<fp32, [5]> s_t_7 = add(x = prev_scale_7, y = inner_encoder__t_index)[name = tensor<string, []>("s_t_7")];
             tensor<fp32, [5]> sqrt_s_t_7 = sqrt(x = s_t_7)[name = tensor<string, []>("sqrt_s_t_7")];
             tensor<bool, []> qk_7_transpose_x_1 = const()[name = tensor<string, []>("qk_7_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_7_transpose_y_1 = const()[name = tensor<string, []>("qk_7_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_839)[name = tensor<string, []>("transpose_36")];
-            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_831)[name = tensor<string, []>("transpose_37")];
+            tensor<fp32, [1, 4, 5, 64]> k_7 = transpose(perm = k_7_perm_0, x = var_905)[name = tensor<string, []>("transpose_36")];
+            tensor<fp32, [1, 4, 5, 64]> q_7 = transpose(perm = q_7_perm_0, x = var_897)[name = tensor<string, []>("transpose_37")];
             tensor<fp32, [1, 4, 5, 5]> qk_7 = matmul(transpose_x = qk_7_transpose_x_1, transpose_y = qk_7_transpose_y_1, x = q_7, y = k_7)[name = tensor<string, []>("qk_7")];
-            tensor<int32, [2]> var_855 = const()[name = tensor<string, []>("op_855"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_856 = reshape(shape = var_855, x = sqrt_s_t_7)[name = tensor<string, []>("op_856")];
-            tensor<fp32, [5, 5]> M_7 = real_div(x = encoder__causal_mask, y = var_856)[name = tensor<string, []>("M_7")];
-            tensor<fp32, [1, 4, 5, 5]> var_858 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_858")];
+            tensor<int32, [2]> var_921 = const()[name = tensor<string, []>("op_921"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_922 = reshape(shape = var_921, x = sqrt_s_t_7)[name = tensor<string, []>("op_922")];
+            tensor<fp32, [5, 5]> M_7 = real_div(x = inner_encoder__causal_mask, y = var_922)[name = tensor<string, []>("M_7")];
+            tensor<fp32, [1, 4, 5, 5]> var_924 = mul(x = qk_7, y = M_7)[name = tensor<string, []>("op_924")];
             tensor<bool, []> inner_7_transpose_x_0 = const()[name = tensor<string, []>("inner_7_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_7_transpose_y_0 = const()[name = tensor<string, []>("inner_7_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_845)[name = tensor<string, []>("transpose_35")];
-            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_858, y = v_7)[name = tensor<string, []>("inner_7")];
-            tensor<bool, []> var_860_transpose_x_0 = const()[name = tensor<string, []>("op_860_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_860_transpose_y_0 = const()[name = tensor<string, []>("op_860_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 5, 64]> var_860 = matmul(transpose_x = var_860_transpose_x_0, transpose_y = var_860_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_860")];
-            tensor<fp32, [5]> var_861 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_861")];
-            tensor<int32, [4]> var_862 = const()[name = tensor<string, []>("op_862"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_863 = reshape(shape = var_862, x = var_861)[name = tensor<string, []>("op_863")];
-            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_860, y = var_863)[name = tensor<string, []>("cross_7")];
+            tensor<fp32, [1, 4, 5, 64]> v_7 = transpose(perm = v_7_perm_0, x = var_911)[name = tensor<string, []>("transpose_35")];
+            tensor<fp32, [1, 4, 5, 64]> inner_7 = matmul(transpose_x = inner_7_transpose_x_0, transpose_y = inner_7_transpose_y_0, x = var_924, y = v_7)[name = tensor<string, []>("inner_7")];
+            tensor<bool, []> var_926_transpose_x_0 = const()[name = tensor<string, []>("op_926_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_926_transpose_y_0 = const()[name = tensor<string, []>("op_926_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 5, 64]> var_926 = matmul(transpose_x = var_926_transpose_x_0, transpose_y = var_926_transpose_y_0, x = q_7, y = prev_kv_7)[name = tensor<string, []>("op_926")];
+            tensor<fp32, [5]> var_927 = real_div(x = sqrt_s0_7, y = sqrt_s_t_7)[name = tensor<string, []>("op_927")];
+            tensor<int32, [4]> var_928 = const()[name = tensor<string, []>("op_928"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_929 = reshape(shape = var_928, x = var_927)[name = tensor<string, []>("op_929")];
+            tensor<fp32, [1, 4, 5, 64]> cross_7 = mul(x = var_926, y = var_929)[name = tensor<string, []>("cross_7")];
             tensor<fp32, [1, 4, 5, 64]> out_19 = add(x = inner_7, y = cross_7)[name = tensor<string, []>("out_19")];
-            tensor<fp32, [1, 4, 64, 64]> var_866 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_866")];
-            tensor<bool, []> var_868_transpose_x_1 = const()[name = tensor<string, []>("op_868_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_868_transpose_y_1 = const()[name = tensor<string, []>("op_868_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 4, 64, 64]> var_868 = matmul(transpose_x = var_868_transpose_x_1, transpose_y = var_868_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_868")];
-            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_866, y = var_868)[name = tensor<string, []>("new_kv_unnorm_7")];
-            tensor<fp32, []> var_870 = const()[name = tensor<string, []>("op_870"), val = tensor<fp32, []>(0x1.4p+2)];
-            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_870)[name = tensor<string, []>("new_scale_7")];
-            tensor<fp32, [1]> var_872 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_872")];
-            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_872)[name = tensor<string, []>("nkv_1")];
-            tensor<int32, [4]> var_874_perm_0 = const()[name = tensor<string, []>("op_874_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [1, 4, 64, 64]> var_932 = mul(x = prev_kv_7, y = sqrt_s0_7)[name = tensor<string, []>("op_932")];
+            tensor<bool, []> var_934_transpose_x_1 = const()[name = tensor<string, []>("op_934_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_934_transpose_y_1 = const()[name = tensor<string, []>("op_934_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 4, 64, 64]> var_934 = matmul(transpose_x = var_934_transpose_x_1, transpose_y = var_934_transpose_y_1, x = k_7, y = v_7)[name = tensor<string, []>("op_934")];
+            tensor<fp32, [1, 4, 64, 64]> new_kv_unnorm_7 = add(x = var_932, y = var_934)[name = tensor<string, []>("new_kv_unnorm_7")];
+            tensor<fp32, []> var_936 = const()[name = tensor<string, []>("op_936"), val = tensor<fp32, []>(0x1.4p+2)];
+            tensor<fp32, [1]> new_scale_7 = add(x = prev_scale_7, y = var_936)[name = tensor<string, []>("new_scale_7")];
+            tensor<fp32, [1]> var_938 = sqrt(x = new_scale_7)[name = tensor<string, []>("op_938")];
+            tensor<fp32, [1, 4, 64, 64]> nkv_1 = real_div(x = new_kv_unnorm_7, y = var_938)[name = tensor<string, []>("nkv_1")];
+            tensor<int32, [4]> var_940_perm_0 = const()[name = tensor<string, []>("op_940_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 4, 64]> var_874 = transpose(perm = var_874_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
-            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_18, x = var_874)[name = tensor<string, []>("out_21")];
-            tensor<int32, [3]> var_878 = const()[name = tensor<string, []>("op_878"), val = tensor<int32, [3]>([1, 5, 256])];
-            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_878, x = out_21)[name = tensor<string, []>("out_23")];
-            tensor<fp32, [1, 5, 256]> var_880 = silu(x = input_137)[name = tensor<string, []>("op_880")];
-            tensor<fp32, [1, 5, 256]> input_139 = mul(x = var_880, y = out_23)[name = tensor<string, []>("input_139")];
-            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = encoder_out_proj_3_bias, weight = encoder_out_proj_3_weight, x = input_139)[name = tensor<string, []>("linear_34")];
-            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_133, y = ret_out_7)[name = tensor<string, []>("x_21")];
+            tensor<fp32, [1, 5, 4, 64]> var_940 = transpose(perm = var_940_perm_0, x = out_19)[name = tensor<string, []>("transpose_34")];
+            tensor<fp32, [1, 5, 4, 64]> out_21 = layer_norm(axes = out_21_axes_0, epsilon = var_84, x = var_940)[name = tensor<string, []>("out_21")];
+            tensor<int32, [3]> var_944 = const()[name = tensor<string, []>("op_944"), val = tensor<int32, [3]>([1, 5, 256])];
+            tensor<fp32, [1, 5, 256]> out_23 = reshape(shape = var_944, x = out_21)[name = tensor<string, []>("out_23")];
+            tensor<fp32, [1, 5, 256]> var_946 = silu(x = input_139)[name = tensor<string, []>("op_946")];
+            tensor<fp32, [1, 5, 256]> input_141 = mul(x = var_946, y = out_23)[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 5, 256]> ret_out_7 = linear(bias = inner_encoder_out_proj_3_bias, weight = inner_encoder_out_proj_3_weight, x = input_141)[name = tensor<string, []>("linear_34")];
+            tensor<fp32, [1, 5, 256]> x_21 = add(x = input_135, y = ret_out_7)[name = tensor<string, []>("x_21")];
             tensor<int32, [4]> window_37_begin_0 = const()[name = tensor<string, []>("window_37_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
             tensor<int32, [4]> window_37_end_0 = const()[name = tensor<string, []>("window_37_end_0"), val = tensor<int32, [4]>([4, 1, 16, 256])];
             tensor<bool, [4]> window_37_end_mask_0 = const()[name = tensor<string, []>("window_37_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> window_37_squeeze_mask_0 = const()[name = tensor<string, []>("window_37_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
             tensor<fp32, [1, 16, 256]> window_37 = slice_by_index(begin = window_37_begin_0, end = window_37_end_0, end_mask = window_37_end_mask_0, squeeze_mask = window_37_squeeze_mask_0, x = enc_conv_cache)[name = tensor<string, []>("window_37")];
-            tensor<int32, [3]> var_888_begin_0 = const()[name = tensor<string, []>("op_888_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> var_888_end_0 = const()[name = tensor<string, []>("op_888_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_888_end_mask_0 = const()[name = tensor<string, []>("op_888_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_888 = slice_by_index(begin = var_888_begin_0, end = var_888_end_0, end_mask = var_888_end_mask_0, x = x_21)[name = tensor<string, []>("op_888")];
-            tensor<int32, [3]> var_891_begin_0 = const()[name = tensor<string, []>("op_891_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_891_end_0 = const()[name = tensor<string, []>("op_891_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_891_end_mask_0 = const()[name = tensor<string, []>("op_891_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_891 = slice_by_index(begin = var_891_begin_0, end = var_891_end_0, end_mask = var_891_end_mask_0, x = window_37)[name = tensor<string, []>("op_891")];
+            tensor<int32, [3]> var_954_begin_0 = const()[name = tensor<string, []>("op_954_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_954_end_0 = const()[name = tensor<string, []>("op_954_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_954_end_mask_0 = const()[name = tensor<string, []>("op_954_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_954 = slice_by_index(begin = var_954_begin_0, end = var_954_end_0, end_mask = var_954_end_mask_0, x = x_21)[name = tensor<string, []>("op_954")];
+            tensor<int32, [3]> var_957_begin_0 = const()[name = tensor<string, []>("op_957_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_957_end_0 = const()[name = tensor<string, []>("op_957_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_957_end_mask_0 = const()[name = tensor<string, []>("op_957_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_957 = slice_by_index(begin = var_957_begin_0, end = var_957_end_0, end_mask = var_957_end_mask_0, x = window_37)[name = tensor<string, []>("op_957")];
             tensor<bool, []> window_39_interleave_0 = const()[name = tensor<string, []>("window_39_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_27, interleave = window_39_interleave_0, values = (var_891, var_888))[name = tensor<string, []>("window_39")];
-            tensor<int32, [3]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
-            tensor<bool, [3]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_896 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = x_21)[name = tensor<string, []>("op_896")];
-            tensor<int32, [3]> var_899_begin_0 = const()[name = tensor<string, []>("op_899_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_899_end_0 = const()[name = tensor<string, []>("op_899_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_899_end_mask_0 = const()[name = tensor<string, []>("op_899_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_899 = slice_by_index(begin = var_899_begin_0, end = var_899_end_0, end_mask = var_899_end_mask_0, x = window_39)[name = tensor<string, []>("op_899")];
+            tensor<fp32, [1, 16, 256]> window_39 = concat(axis = var_93, interleave = window_39_interleave_0, values = (var_957, var_954))[name = tensor<string, []>("window_39")];
+            tensor<int32, [3]> var_962_begin_0 = const()[name = tensor<string, []>("op_962_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_962_end_0 = const()[name = tensor<string, []>("op_962_end_0"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<bool, [3]> var_962_end_mask_0 = const()[name = tensor<string, []>("op_962_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_962 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = x_21)[name = tensor<string, []>("op_962")];
+            tensor<int32, [3]> var_965_begin_0 = const()[name = tensor<string, []>("op_965_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_965_end_0 = const()[name = tensor<string, []>("op_965_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_965_end_mask_0 = const()[name = tensor<string, []>("op_965_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_965 = slice_by_index(begin = var_965_begin_0, end = var_965_end_0, end_mask = var_965_end_mask_0, x = window_39)[name = tensor<string, []>("op_965")];
             tensor<bool, []> window_41_interleave_0 = const()[name = tensor<string, []>("window_41_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_27, interleave = window_41_interleave_0, values = (var_899, var_896))[name = tensor<string, []>("window_41")];
-            tensor<int32, [3]> var_904_begin_0 = const()[name = tensor<string, []>("op_904_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
-            tensor<int32, [3]> var_904_end_0 = const()[name = tensor<string, []>("op_904_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
-            tensor<bool, [3]> var_904_end_mask_0 = const()[name = tensor<string, []>("op_904_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_904 = slice_by_index(begin = var_904_begin_0, end = var_904_end_0, end_mask = var_904_end_mask_0, x = x_21)[name = tensor<string, []>("op_904")];
-            tensor<int32, [3]> var_907_begin_0 = const()[name = tensor<string, []>("op_907_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_907_end_0 = const()[name = tensor<string, []>("op_907_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_907_end_mask_0 = const()[name = tensor<string, []>("op_907_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_907 = slice_by_index(begin = var_907_begin_0, end = var_907_end_0, end_mask = var_907_end_mask_0, x = window_41)[name = tensor<string, []>("op_907")];
+            tensor<fp32, [1, 16, 256]> window_41 = concat(axis = var_93, interleave = window_41_interleave_0, values = (var_965, var_962))[name = tensor<string, []>("window_41")];
+            tensor<int32, [3]> var_970_begin_0 = const()[name = tensor<string, []>("op_970_begin_0"), val = tensor<int32, [3]>([0, 2, 0])];
+            tensor<int32, [3]> var_970_end_0 = const()[name = tensor<string, []>("op_970_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> var_970_end_mask_0 = const()[name = tensor<string, []>("op_970_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_970 = slice_by_index(begin = var_970_begin_0, end = var_970_end_0, end_mask = var_970_end_mask_0, x = x_21)[name = tensor<string, []>("op_970")];
+            tensor<int32, [3]> var_973_begin_0 = const()[name = tensor<string, []>("op_973_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_973_end_0 = const()[name = tensor<string, []>("op_973_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_973_end_mask_0 = const()[name = tensor<string, []>("op_973_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_973 = slice_by_index(begin = var_973_begin_0, end = var_973_end_0, end_mask = var_973_end_mask_0, x = window_41)[name = tensor<string, []>("op_973")];
             tensor<bool, []> window_43_interleave_0 = const()[name = tensor<string, []>("window_43_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_27, interleave = window_43_interleave_0, values = (var_907, var_904))[name = tensor<string, []>("window_43")];
-            tensor<int32, [3]> var_912_begin_0 = const()[name = tensor<string, []>("op_912_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
-            tensor<int32, [3]> var_912_end_0 = const()[name = tensor<string, []>("op_912_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
-            tensor<bool, [3]> var_912_end_mask_0 = const()[name = tensor<string, []>("op_912_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
-            tensor<fp32, [1, 1, 256]> var_912 = slice_by_index(begin = var_912_begin_0, end = var_912_end_0, end_mask = var_912_end_mask_0, x = x_21)[name = tensor<string, []>("op_912")];
-            tensor<int32, [3]> var_915_begin_0 = const()[name = tensor<string, []>("op_915_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_915_end_0 = const()[name = tensor<string, []>("op_915_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_915_end_mask_0 = const()[name = tensor<string, []>("op_915_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_915 = slice_by_index(begin = var_915_begin_0, end = var_915_end_0, end_mask = var_915_end_mask_0, x = window_43)[name = tensor<string, []>("op_915")];
+            tensor<fp32, [1, 16, 256]> window_43 = concat(axis = var_93, interleave = window_43_interleave_0, values = (var_973, var_970))[name = tensor<string, []>("window_43")];
+            tensor<int32, [3]> var_978_begin_0 = const()[name = tensor<string, []>("op_978_begin_0"), val = tensor<int32, [3]>([0, 3, 0])];
+            tensor<int32, [3]> var_978_end_0 = const()[name = tensor<string, []>("op_978_end_0"), val = tensor<int32, [3]>([1, 4, 256])];
+            tensor<bool, [3]> var_978_end_mask_0 = const()[name = tensor<string, []>("op_978_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp32, [1, 1, 256]> var_978 = slice_by_index(begin = var_978_begin_0, end = var_978_end_0, end_mask = var_978_end_mask_0, x = x_21)[name = tensor<string, []>("op_978")];
+            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = window_43)[name = tensor<string, []>("op_981")];
             tensor<bool, []> window_45_interleave_0 = const()[name = tensor<string, []>("window_45_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_27, interleave = window_45_interleave_0, values = (var_915, var_912))[name = tensor<string, []>("window_45")];
-            tensor<int32, [3]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
-            tensor<int32, [3]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
-            tensor<bool, [3]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 1, 256]> var_920 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = x_21)[name = tensor<string, []>("op_920")];
-            tensor<int32, [3]> var_923_begin_0 = const()[name = tensor<string, []>("op_923_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
-            tensor<int32, [3]> var_923_end_0 = const()[name = tensor<string, []>("op_923_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
-            tensor<bool, [3]> var_923_end_mask_0 = const()[name = tensor<string, []>("op_923_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 15, 256]> var_923 = slice_by_index(begin = var_923_begin_0, end = var_923_end_0, end_mask = var_923_end_mask_0, x = window_45)[name = tensor<string, []>("op_923")];
+            tensor<fp32, [1, 16, 256]> window_45 = concat(axis = var_93, interleave = window_45_interleave_0, values = (var_981, var_978))[name = tensor<string, []>("window_45")];
+            tensor<int32, [3]> var_986_begin_0 = const()[name = tensor<string, []>("op_986_begin_0"), val = tensor<int32, [3]>([0, 4, 0])];
+            tensor<int32, [3]> var_986_end_0 = const()[name = tensor<string, []>("op_986_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> var_986_end_mask_0 = const()[name = tensor<string, []>("op_986_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 1, 256]> var_986 = slice_by_index(begin = var_986_begin_0, end = var_986_end_0, end_mask = var_986_end_mask_0, x = x_21)[name = tensor<string, []>("op_986")];
+            tensor<int32, [3]> var_989_begin_0 = const()[name = tensor<string, []>("op_989_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> var_989_end_0 = const()[name = tensor<string, []>("op_989_end_0"), val = tensor<int32, [3]>([1, 16, 256])];
+            tensor<bool, [3]> var_989_end_mask_0 = const()[name = tensor<string, []>("op_989_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 15, 256]> var_989 = slice_by_index(begin = var_989_begin_0, end = var_989_end_0, end_mask = var_989_end_mask_0, x = window_45)[name = tensor<string, []>("op_989")];
             tensor<bool, []> window_interleave_0 = const()[name = tensor<string, []>("window_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 16, 256]> window = concat(axis = var_27, interleave = window_interleave_0, values = (var_923, var_920))[name = tensor<string, []>("window")];
-            tensor<bool, []> input_141_interleave_0 = const()[name = tensor<string, []>("input_141_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5, 16, 256]> input_141 = concat(axis = var_24, interleave = input_141_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_141")];
+            tensor<fp32, [1, 16, 256]> window = concat(axis = var_93, interleave = window_interleave_0, values = (var_989, var_986))[name = tensor<string, []>("window")];
+            tensor<bool, []> input_143_interleave_0 = const()[name = tensor<string, []>("input_143_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [5, 16, 256]> input_143 = concat(axis = var_79, interleave = input_143_interleave_0, values = (window_39, window_41, window_43, window_45, window))[name = tensor<string, []>("input_143")];
             tensor<int32, [1]> x_23_axes_0 = const()[name = tensor<string, []>("x_23_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = encoder_conv_module_3_sequential_0_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_0_weight, x = input_141)[name = tensor<string, []>("x_23")];
-            tensor<int32, [3]> input_143_perm_0 = const()[name = tensor<string, []>("input_143_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp32, [5, 16, 256]> x_23 = layer_norm(axes = x_23_axes_0, beta = inner_encoder_conv_module_3_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_0_weight, x = input_143)[name = tensor<string, []>("x_23")];
+            tensor<int32, [3]> input_145_perm_0 = const()[name = tensor<string, []>("input_145_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<string, []> inputs_33_pad_type_0 = const()[name = tensor<string, []>("inputs_33_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> inputs_33_strides_0 = const()[name = tensor<string, []>("inputs_33_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> inputs_33_pad_0 = const()[name = tensor<string, []>("inputs_33_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> inputs_33_dilations_0 = const()[name = tensor<string, []>("inputs_33_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> inputs_33_groups_0 = const()[name = tensor<string, []>("inputs_33_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_143 = transpose(perm = input_143_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
-            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = encoder_conv_module_3_sequential_2_conv_weight, x = input_143)[name = tensor<string, []>("inputs_33")];
-            tensor<int32, [2]> var_948_split_sizes_0 = const()[name = tensor<string, []>("op_948_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
-            tensor<int32, []> var_948_axis_0 = const()[name = tensor<string, []>("op_948_axis_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> var_948_0, tensor<fp32, [5, 256, 16]> var_948_1 = split(axis = var_948_axis_0, split_sizes = var_948_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_948")];
-            tensor<fp32, [5, 256, 16]> var_950 = sigmoid(x = var_948_1)[name = tensor<string, []>("op_950")];
-            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_948_0, y = var_950)[name = tensor<string, []>("inputs_35")];
+            tensor<fp32, [5, 256, 16]> input_145 = transpose(perm = input_145_perm_0, x = x_23)[name = tensor<string, []>("transpose_33")];
+            tensor<fp32, [5, 512, 16]> inputs_33 = conv(bias = inner_encoder_conv_module_3_sequential_2_conv_bias, dilations = inputs_33_dilations_0, groups = inputs_33_groups_0, pad = inputs_33_pad_0, pad_type = inputs_33_pad_type_0, strides = inputs_33_strides_0, weight = inner_encoder_conv_module_3_sequential_2_conv_weight, x = input_145)[name = tensor<string, []>("inputs_33")];
+            tensor<int32, [2]> var_1014_split_sizes_0 = const()[name = tensor<string, []>("op_1014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            tensor<int32, []> var_1014_axis_0 = const()[name = tensor<string, []>("op_1014_axis_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> var_1014_0, tensor<fp32, [5, 256, 16]> var_1014_1 = split(axis = var_1014_axis_0, split_sizes = var_1014_split_sizes_0, x = inputs_33)[name = tensor<string, []>("op_1014")];
+            tensor<fp32, [5, 256, 16]> var_1016 = sigmoid(x = var_1014_1)[name = tensor<string, []>("op_1016")];
+            tensor<fp32, [5, 256, 16]> inputs_35 = mul(x = var_1014_0, y = var_1016)[name = tensor<string, []>("inputs_35")];
             tensor<string, []> outputs_aug_pad_type_0 = const()[name = tensor<string, []>("outputs_aug_pad_type_0"), val = tensor<string, []>("custom")];
             tensor<int32, [2]> outputs_aug_pad_0 = const()[name = tensor<string, []>("outputs_aug_pad_0"), val = tensor<int32, [2]>([15, 15])];
             tensor<int32, []> outputs_aug_groups_0 = const()[name = tensor<string, []>("outputs_aug_groups_0"), val = tensor<int32, []>(256)];
             tensor<int32, [1]> outputs_aug_strides_0 = const()[name = tensor<string, []>("outputs_aug_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [1]> outputs_aug_dilations_0 = const()[name = tensor<string, []>("outputs_aug_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
-            tensor<int32, [3]> input_145_begin_0 = const()[name = tensor<string, []>("input_145_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
-            tensor<int32, [3]> input_145_end_0 = const()[name = tensor<string, []>("input_145_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
-            tensor<bool, [3]> input_145_end_mask_0 = const()[name = tensor<string, []>("input_145_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
-            tensor<fp32, [5, 256, 16]> input_145 = slice_by_index(begin = input_145_begin_0, end = input_145_end_0, end_mask = input_145_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_145")];
-            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = encoder_conv_module_3_sequential_5_bias, epsilon = var_29, gamma = encoder_conv_module_3_sequential_5_weight, mean = encoder_conv_module_3_sequential_5_running_mean, variance = encoder_conv_module_3_sequential_5_running_var, x = input_145)[name = tensor<string, []>("inputs_37")];
-            tensor<fp32, [5, 256, 16]> input_147 = silu(x = inputs_37)[name = tensor<string, []>("input_147")];
-            tensor<string, []> input_149_pad_type_0 = const()[name = tensor<string, []>("input_149_pad_type_0"), val = tensor<string, []>("valid")];
-            tensor<int32, [1]> input_149_strides_0 = const()[name = tensor<string, []>("input_149_strides_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, [2]> input_149_pad_0 = const()[name = tensor<string, []>("input_149_pad_0"), val = tensor<int32, [2]>([0, 0])];
-            tensor<int32, [1]> input_149_dilations_0 = const()[name = tensor<string, []>("input_149_dilations_0"), val = tensor<int32, [1]>([1])];
-            tensor<int32, []> input_149_groups_0 = const()[name = tensor<string, []>("input_149_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [5, 256, 16]> input_149 = conv(bias = encoder_conv_module_3_sequential_7_conv_bias, dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = encoder_conv_module_3_sequential_7_conv_weight, x = input_147)[name = tensor<string, []>("input_149")];
+            tensor<fp32, [5, 256, 31]> outputs_aug = conv(dilations = outputs_aug_dilations_0, groups = outputs_aug_groups_0, pad = outputs_aug_pad_0, pad_type = outputs_aug_pad_type_0, strides = outputs_aug_strides_0, weight = inner_encoder_conv_module_3_sequential_4_conv_weight, x = inputs_35)[name = tensor<string, []>("outputs_aug")];
+            tensor<int32, [3]> input_147_begin_0 = const()[name = tensor<string, []>("input_147_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> input_147_end_0 = const()[name = tensor<string, []>("input_147_end_0"), val = tensor<int32, [3]>([5, 256, 16])];
+            tensor<bool, [3]> input_147_end_mask_0 = const()[name = tensor<string, []>("input_147_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp32, [5, 256, 16]> input_147 = slice_by_index(begin = input_147_begin_0, end = input_147_end_0, end_mask = input_147_end_mask_0, x = outputs_aug)[name = tensor<string, []>("input_147")];
+            tensor<fp32, [5, 256, 16]> inputs_37 = batch_norm(beta = inner_encoder_conv_module_3_sequential_5_bias, epsilon = var_76, gamma = inner_encoder_conv_module_3_sequential_5_weight, mean = inner_encoder_conv_module_3_sequential_5_running_mean, variance = inner_encoder_conv_module_3_sequential_5_running_var, x = input_147)[name = tensor<string, []>("inputs_37")];
+            tensor<fp32, [5, 256, 16]> input_149 = silu(x = inputs_37)[name = tensor<string, []>("input_149")];
+            tensor<string, []> input_151_pad_type_0 = const()[name = tensor<string, []>("input_151_pad_type_0"), val = tensor<string, []>("valid")];
+            tensor<int32, [1]> input_151_strides_0 = const()[name = tensor<string, []>("input_151_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> input_151_pad_0 = const()[name = tensor<string, []>("input_151_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> input_151_dilations_0 = const()[name = tensor<string, []>("input_151_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, []> input_151_groups_0 = const()[name = tensor<string, []>("input_151_groups_0"), val = tensor<int32, []>(1)];
+            tensor<fp32, [5, 256, 16]> input_151 = conv(bias = inner_encoder_conv_module_3_sequential_7_conv_bias, dilations = input_151_dilations_0, groups = input_151_groups_0, pad = input_151_pad_0, pad_type = input_151_pad_type_0, strides = input_151_strides_0, weight = inner_encoder_conv_module_3_sequential_7_conv_weight, x = input_149)[name = tensor<string, []>("input_151")];
             tensor<int32, [3]> conv_out_7_perm_0 = const()[name = tensor<string, []>("conv_out_7_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [3]> var_981_begin_0 = const()[name = tensor<string, []>("op_981_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
-            tensor<int32, [3]> var_981_end_0 = const()[name = tensor<string, []>("op_981_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
-            tensor<bool, [3]> var_981_end_mask_0 = const()[name = tensor<string, []>("op_981_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_149)[name = tensor<string, []>("transpose_32")];
-            tensor<fp32, [5, 1, 256]> var_981 = slice_by_index(begin = var_981_begin_0, end = var_981_end_0, end_mask = var_981_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_981")];
-            tensor<int32, [3]> var_983_perm_0 = const()[name = tensor<string, []>("op_983_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<fp32, [1, 5, 256]> var_983 = transpose(perm = var_983_perm_0, x = var_981)[name = tensor<string, []>("transpose_31")];
-            tensor<fp32, [1, 5, 256]> input_151 = add(x = x_21, y = var_983)[name = tensor<string, []>("input_151")];
-            tensor<int32, [1]> input_153_axes_0 = const()[name = tensor<string, []>("input_153_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_153 = layer_norm(axes = input_153_axes_0, beta = encoder_ffn2_3_module_sequential_0_bias, epsilon = var_29, gamma = encoder_ffn2_3_module_sequential_0_weight, x = input_151)[name = tensor<string, []>("input_153")];
-            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = encoder_ffn2_3_module_sequential_1_linear_bias, weight = encoder_ffn2_3_module_sequential_1_linear_weight, x = input_153)[name = tensor<string, []>("linear_35")];
-            tensor<fp32, [1, 5, 1024]> input_155 = silu(x = inputs)[name = tensor<string, []>("input_155")];
-            tensor<fp32, [1, 5, 256]> input_159 = linear(bias = encoder_ffn2_3_module_sequential_4_linear_bias, weight = encoder_ffn2_3_module_sequential_4_linear_weight, x = input_155)[name = tensor<string, []>("linear_36")];
-            tensor<fp32, []> var_1006 = const()[name = tensor<string, []>("op_1006"), val = tensor<fp32, []>(0x1p-1)];
-            tensor<fp32, [1, 5, 256]> var_1007 = mul(x = input_159, y = var_1006)[name = tensor<string, []>("op_1007")];
-            tensor<fp32, [1, 5, 256]> input_161 = add(x = var_1007, y = input_151)[name = tensor<string, []>("input_161")];
+            tensor<int32, [3]> var_1047_begin_0 = const()[name = tensor<string, []>("op_1047_begin_0"), val = tensor<int32, [3]>([0, -1, 0])];
+            tensor<int32, [3]> var_1047_end_0 = const()[name = tensor<string, []>("op_1047_end_0"), val = tensor<int32, [3]>([5, 16, 256])];
+            tensor<bool, [3]> var_1047_end_mask_0 = const()[name = tensor<string, []>("op_1047_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [5, 16, 256]> conv_out_7 = transpose(perm = conv_out_7_perm_0, x = input_151)[name = tensor<string, []>("transpose_32")];
+            tensor<fp32, [5, 1, 256]> var_1047 = slice_by_index(begin = var_1047_begin_0, end = var_1047_end_0, end_mask = var_1047_end_mask_0, x = conv_out_7)[name = tensor<string, []>("op_1047")];
+            tensor<int32, [3]> var_1049_perm_0 = const()[name = tensor<string, []>("op_1049_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<fp32, [1, 5, 256]> var_1049 = transpose(perm = var_1049_perm_0, x = var_1047)[name = tensor<string, []>("transpose_31")];
+            tensor<fp32, [1, 5, 256]> input_153 = add(x = x_21, y = var_1049)[name = tensor<string, []>("input_153")];
+            tensor<int32, [1]> input_155_axes_0 = const()[name = tensor<string, []>("input_155_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_155 = layer_norm(axes = input_155_axes_0, beta = inner_encoder_ffn2_3_module_sequential_0_bias, epsilon = var_76, gamma = inner_encoder_ffn2_3_module_sequential_0_weight, x = input_153)[name = tensor<string, []>("input_155")];
+            tensor<fp32, [1, 5, 1024]> inputs = linear(bias = inner_encoder_ffn2_3_module_sequential_1_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_1_linear_weight, x = input_155)[name = tensor<string, []>("linear_35")];
+            tensor<fp32, [1, 5, 1024]> input_157 = silu(x = inputs)[name = tensor<string, []>("input_157")];
+            tensor<fp32, [1, 5, 256]> input_161 = linear(bias = inner_encoder_ffn2_3_module_sequential_4_linear_bias, weight = inner_encoder_ffn2_3_module_sequential_4_linear_weight, x = input_157)[name = tensor<string, []>("linear_36")];
+            tensor<fp32, []> var_1072 = const()[name = tensor<string, []>("op_1072"), val = tensor<fp32, []>(0x1p-1)];
+            tensor<fp32, [1, 5, 256]> var_1073 = mul(x = input_161, y = var_1072)[name = tensor<string, []>("op_1073")];
+            tensor<fp32, [1, 5, 256]> input_163 = add(x = var_1073, y = input_153)[name = tensor<string, []>("input_163")];
             tensor<int32, [1]> x_25_axes_0 = const()[name = tensor<string, []>("x_25_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = encoder_layer_norm_3_bias, epsilon = var_29, gamma = encoder_layer_norm_3_weight, x = input_161)[name = tensor<string, []>("x_25")];
+            tensor<fp32, [1, 5, 256]> x_25 = layer_norm(axes = x_25_axes_0, beta = inner_encoder_layer_norm_3_bias, epsilon = var_76, gamma = inner_encoder_layer_norm_3_weight, x = input_163)[name = tensor<string, []>("x_25")];
             tensor<int32, [3]> x_ct_perm_0 = const()[name = tensor<string, []>("x_ct_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<bool, []> cat_interleave_0 = const()[name = tensor<string, []>("cat_interleave_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [1, 256, 5]> x_ct = transpose(perm = x_ct_perm_0, x = x_25)[name = tensor<string, []>("transpose_30")];
-            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_21, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
+            tensor<fp32, [1, 256, 23]> cat = concat(axis = var_81, interleave = cat_interleave_0, values = (cnn_window, x_ct))[name = tensor<string, []>("cat")];
             tensor<string, []> conv_out_pad_type_0 = const()[name = tensor<string, []>("conv_out_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [1]> conv_out_strides_0 = const()[name = tensor<string, []>("conv_out_strides_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, [2]> conv_out_pad_0 = const()[name = tensor<string, []>("conv_out_pad_0"), val = tensor<int32, [2]>([0, 0])];
             tensor<int32, [1]> conv_out_dilations_0 = const()[name = tensor<string, []>("conv_out_dilations_0"), val = tensor<int32, [1]>([1])];
             tensor<int32, []> conv_out_groups_0 = const()[name = tensor<string, []>("conv_out_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
-            tensor<int32, [3]> var_1025_begin_0 = const()[name = tensor<string, []>("op_1025_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
-            tensor<int32, [3]> var_1025_end_0 = const()[name = tensor<string, []>("op_1025_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
-            tensor<bool, [3]> var_1025_end_mask_0 = const()[name = tensor<string, []>("op_1025_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
-            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = cat)[name = tensor<string, []>("op_1025")];
-            tensor<int32, [3]> input_163_perm_0 = const()[name = tensor<string, []>("input_163_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [1]> var_1027 = const()[name = tensor<string, []>("op_1027"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 256]> input_163 = transpose(perm = input_163_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
-            tensor<fp32, [1, 5, 1]> var_1028 = reduce_l2_norm(axes = var_1027, keep_dims = var_30, x = input_163)[name = tensor<string, []>("op_1028")];
+            tensor<fp32, [1, 256, 5]> conv_out = conv(bias = inner_encoder_cnn_bias, dilations = conv_out_dilations_0, groups = conv_out_groups_0, pad = conv_out_pad_0, pad_type = conv_out_pad_type_0, strides = conv_out_strides_0, weight = inner_encoder_cnn_weight, x = cat)[name = tensor<string, []>("conv_out")];
+            tensor<int32, [3]> var_1091_begin_0 = const()[name = tensor<string, []>("op_1091_begin_0"), val = tensor<int32, [3]>([0, 0, 5])];
+            tensor<int32, [3]> var_1091_end_0 = const()[name = tensor<string, []>("op_1091_end_0"), val = tensor<int32, [3]>([1, 256, 23])];
+            tensor<bool, [3]> var_1091_end_mask_0 = const()[name = tensor<string, []>("op_1091_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp32, [1, 256, 18]> cnn_window_new = slice_by_index(begin = var_1091_begin_0, end = var_1091_end_0, end_mask = var_1091_end_mask_0, x = cat)[name = tensor<string, []>("op_1091")];
+            tensor<int32, [3]> input_165_perm_0 = const()[name = tensor<string, []>("input_165_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1093 = const()[name = tensor<string, []>("op_1093"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 256]> input_165 = transpose(perm = input_165_perm_0, x = conv_out)[name = tensor<string, []>("transpose_29")];
+            tensor<fp32, [1, 5, 1]> var_1094 = reduce_l2_norm(axes = var_1093, keep_dims = var_75, x = input_165)[name = tensor<string, []>("op_1094")];
             tensor<fp32, []> const_12 = const()[name = tensor<string, []>("const_12"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_11, beta = const_12, x = var_1028)[name = tensor<string, []>("clip_0")];
-            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_163, y = clip_0)[name = tensor<string, []>("emb")];
-            tensor<int32, []> var_1032_axis_0 = const()[name = tensor<string, []>("op_1032_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1032_axis_0, values = (var_207, var_429, var_651, nkv_1))[name = tensor<string, []>("op_1032")];
-            tensor<int32, []> var_1034_axis_0 = const()[name = tensor<string, []>("op_1034_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1034_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1034")];
-            tensor<int32, []> var_1036_axis_0 = const()[name = tensor<string, []>("op_1036_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1036_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1036")];
-            tensor<fp32, []> var_1045 = const()[name = tensor<string, []>("op_1045"), val = tensor<fp32, []>(0x1.5798eep-27)];
-            tensor<fp32, []> var_1050 = const()[name = tensor<string, []>("op_1050"), val = tensor<fp32, []>(0x1.4f8b58p-17)];
-            tensor<fp32, []> var_1052 = const()[name = tensor<string, []>("op_1052"), val = tensor<fp32, []>(0x1.0c6f7ap-20)];
-            tensor<bool, []> var_1053 = const()[name = tensor<string, []>("op_1053"), val = tensor<bool, []>(true)];
-            tensor<fp32, []> var_1055 = const()[name = tensor<string, []>("op_1055"), val = tensor<fp32, []>(0x1p+0)];
-            tensor<int32, []> var_1059 = const()[name = tensor<string, []>("op_1059"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_1065 = const()[name = tensor<string, []>("op_1065"), val = tensor<int32, []>(0)];
+            tensor<fp32, [1, 5, 1]> clip_0 = clip(alpha = var_90, beta = const_12, x = var_1094)[name = tensor<string, []>("clip_0")];
+            tensor<fp32, [1, 5, 256]> emb = real_div(x = input_165, y = clip_0)[name = tensor<string, []>("emb")];
+            tensor<int32, []> var_1098_axis_0 = const()[name = tensor<string, []>("op_1098_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 4, 64, 64]> enc_kv_new = stack(axis = var_1098_axis_0, values = (var_273, var_495, var_717, nkv_1))[name = tensor<string, []>("op_1098")];
+            tensor<int32, []> var_1100_axis_0 = const()[name = tensor<string, []>("op_1100_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1]> enc_scale_new = stack(axis = var_1100_axis_0, values = (new_scale_1, new_scale_3, new_scale_5, new_scale_7))[name = tensor<string, []>("op_1100")];
+            tensor<int32, []> var_1102_axis_0 = const()[name = tensor<string, []>("op_1102_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [4, 1, 16, 256]> enc_conv_cache_new = stack(axis = var_1102_axis_0, values = (window_11, window_23, window_35, window))[name = tensor<string, []>("op_1102")];
             tensor<fp32, [1, 5, 12, 256]> pos = const()[name = tensor<string, []>("pos"), val = tensor<fp32, [1, 5, 12, 256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44395712)))];
-            tensor<int32, [1]> var_1127_axes_0 = const()[name = tensor<string, []>("op_1127_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp32, [1, 5, 1, 256]> var_1127 = expand_dims(axes = var_1127_axes_0, x = emb)[name = tensor<string, []>("op_1127")];
+            tensor<int32, [1]> var_1170_axes_0 = const()[name = tensor<string, []>("op_1170_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp32, [1, 5, 1, 256]> var_1170 = expand_dims(axes = var_1170_axes_0, x = emb)[name = tensor<string, []>("op_1170")];
             tensor<int32, [4]> emb_exp_reps_0 = const()[name = tensor<string, []>("emb_exp_reps_0"), val = tensor<int32, [4]>([1, 1, 12, 1])];
-            tensor<fp32, [1, 5, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1127)[name = tensor<string, []>("emb_exp")];
-            tensor<bool, []> input_165_interleave_0 = const()[name = tensor<string, []>("input_165_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 5, 12, 512]> input_165 = concat(axis = var_1059, interleave = input_165_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_165")];
-            tensor<fp32, [1, 5, 12, 256]> x_27 = linear(bias = decoder_convert_bias, weight = decoder_convert_weight, x = input_165)[name = tensor<string, []>("linear_37")];
-            tensor<int32, [4]> var_1135_perm_0 = const()[name = tensor<string, []>("op_1135_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1139 = const()[name = tensor<string, []>("op_1139"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1135 = transpose(perm = var_1135_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
-            tensor<fp32, [12, 5, 256]> x_29 = reshape(shape = var_1139, x = var_1135)[name = tensor<string, []>("x_29")];
+            tensor<fp32, [1, 5, 12, 256]> emb_exp = tile(reps = emb_exp_reps_0, x = var_1170)[name = tensor<string, []>("emb_exp")];
+            tensor<bool, []> input_167_interleave_0 = const()[name = tensor<string, []>("input_167_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 5, 12, 512]> input_167 = concat(axis = var_82, interleave = input_167_interleave_0, values = (emb_exp, pos))[name = tensor<string, []>("input_167")];
+            tensor<fp32, [1, 5, 12, 256]> x_27 = linear(bias = inner_decoder_convert_bias, weight = inner_decoder_convert_weight, x = input_167)[name = tensor<string, []>("linear_37")];
+            tensor<int32, [4]> var_1178_perm_0 = const()[name = tensor<string, []>("op_1178_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1182 = const()[name = tensor<string, []>("op_1182"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1178 = transpose(perm = var_1178_perm_0, x = x_27)[name = tensor<string, []>("transpose_28")];
+            tensor<fp32, [12, 5, 256]> x_29 = reshape(shape = var_1182, x = var_1178)[name = tensor<string, []>("x_29")];
             tensor<int32, [5]> prev_kv_9_begin_0 = const()[name = tensor<string, []>("prev_kv_9_begin_0"), val = tensor<int32, [5]>([0, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_9_end_0 = const()[name = tensor<string, []>("prev_kv_9_end_0"), val = tensor<int32, [5]>([1, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_9_end_mask_0 = const()[name = tensor<string, []>("prev_kv_9_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1023,132 +1042,132 @@ program(1.0)
             tensor<bool, [2]> prev_scale_9_end_mask_0 = const()[name = tensor<string, []>("prev_scale_9_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_9_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_9_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale_9 = slice_by_index(begin = prev_scale_9_begin_0, end = prev_scale_9_end_0, end_mask = prev_scale_9_end_mask_0, squeeze_mask = prev_scale_9_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale_9")];
-            tensor<fp32, [12, 5, 256]> var_1147 = linear(bias = decoder_q_proj_0_bias, weight = decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
-            tensor<int32, [4]> var_1148 = const()[name = tensor<string, []>("op_1148"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1149 = reshape(shape = var_1148, x = var_1147)[name = tensor<string, []>("op_1149")];
+            tensor<fp32, [12, 5, 256]> var_1190 = linear(bias = inner_decoder_q_proj_0_bias, weight = inner_decoder_q_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_38")];
+            tensor<int32, [4]> var_1191 = const()[name = tensor<string, []>("op_1191"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1192 = reshape(shape = var_1191, x = var_1190)[name = tensor<string, []>("op_1192")];
             tensor<int32, [4]> q_9_perm_0 = const()[name = tensor<string, []>("q_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1153 = linear(bias = decoder_k_proj_0_bias, weight = decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
-            tensor<fp32, []> var_1154 = const()[name = tensor<string, []>("op_1154"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 5, 256]> var_1155 = mul(x = var_1153, y = var_1154)[name = tensor<string, []>("op_1155")];
-            tensor<int32, [4]> var_1156 = const()[name = tensor<string, []>("op_1156"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1157 = reshape(shape = var_1156, x = var_1155)[name = tensor<string, []>("op_1157")];
+            tensor<fp32, [12, 5, 256]> var_1196 = linear(bias = inner_decoder_k_proj_0_bias, weight = inner_decoder_k_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_39")];
+            tensor<fp32, []> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 5, 256]> var_1198 = mul(x = var_1196, y = var_1197)[name = tensor<string, []>("op_1198")];
+            tensor<int32, [4]> var_1199 = const()[name = tensor<string, []>("op_1199"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1200 = reshape(shape = var_1199, x = var_1198)[name = tensor<string, []>("op_1200")];
             tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1161 = linear(bias = decoder_v_proj_0_bias, weight = decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
-            tensor<int32, [4]> var_1162 = const()[name = tensor<string, []>("op_1162"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1163 = reshape(shape = var_1162, x = var_1161)[name = tensor<string, []>("op_1163")];
+            tensor<fp32, [12, 5, 256]> var_1204 = linear(bias = inner_decoder_v_proj_0_bias, weight = inner_decoder_v_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_40")];
+            tensor<int32, [4]> var_1205 = const()[name = tensor<string, []>("op_1205"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1206 = reshape(shape = var_1205, x = var_1204)[name = tensor<string, []>("op_1206")];
             tensor<int32, [4]> v_9_perm_0 = const()[name = tensor<string, []>("v_9_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> input_169 = linear(bias = decoder_g_proj_0_bias, weight = decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
+            tensor<fp32, [12, 5, 256]> input_171 = linear(bias = inner_decoder_g_proj_0_bias, weight = inner_decoder_g_proj_0_weight, x = x_29)[name = tensor<string, []>("linear_41")];
             tensor<bool, []> cumsum_mask_1_exclusive_0 = const()[name = tensor<string, []>("cumsum_mask_1_exclusive_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> cumsum_mask_1_reverse_0 = const()[name = tensor<string, []>("cumsum_mask_1_reverse_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_1065, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
+            tensor<fp32, [5]> cumsum_mask_1 = cumsum(axis = var_79, exclusive = cumsum_mask_1_exclusive_0, reverse = cumsum_mask_1_reverse_0, x = valid_mask)[name = tensor<string, []>("cumsum_mask_1")];
             tensor<fp32, [1]> sqrt_s0_9 = sqrt(x = prev_scale_9)[name = tensor<string, []>("sqrt_s0_9")];
             tensor<fp32, [5]> s_t_9 = add(x = prev_scale_9, y = cumsum_mask_1)[name = tensor<string, []>("s_t_9")];
             tensor<fp32, []> const_20 = const()[name = tensor<string, []>("const_20"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_1 = clip(alpha = var_1055, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
+            tensor<fp32, [5]> clip_1 = clip(alpha = var_69, beta = const_20, x = s_t_9)[name = tensor<string, []>("clip_1")];
             tensor<fp32, [5]> sqrt_s_t_9 = sqrt(x = clip_1)[name = tensor<string, []>("sqrt_s_t_9")];
             tensor<bool, []> qk_9_transpose_x_1 = const()[name = tensor<string, []>("qk_9_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_9_transpose_y_1 = const()[name = tensor<string, []>("qk_9_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1157)[name = tensor<string, []>("transpose_26")];
-            tensor<fp32, [12, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1149)[name = tensor<string, []>("transpose_27")];
+            tensor<fp32, [12, 4, 5, 64]> k_9 = transpose(perm = k_9_perm_0, x = var_1200)[name = tensor<string, []>("transpose_26")];
+            tensor<fp32, [12, 4, 5, 64]> q_9 = transpose(perm = q_9_perm_0, x = var_1192)[name = tensor<string, []>("transpose_27")];
             tensor<fp32, [12, 4, 5, 5]> qk_9 = matmul(transpose_x = qk_9_transpose_x_1, transpose_y = qk_9_transpose_y_1, x = q_9, y = k_9)[name = tensor<string, []>("qk_9")];
-            tensor<int32, [2]> var_1175 = const()[name = tensor<string, []>("op_1175"), val = tensor<int32, [2]>([1, 5])];
-            tensor<fp32, [1, 5]> var_1176 = reshape(shape = var_1175, x = valid_mask)[name = tensor<string, []>("op_1176")];
-            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = decoder__causal_mask, y = var_1176)[name = tensor<string, []>("causal_with_valid_1")];
-            tensor<int32, [2]> var_1178 = const()[name = tensor<string, []>("op_1178"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1179 = reshape(shape = var_1178, x = sqrt_s_t_9)[name = tensor<string, []>("op_1179")];
-            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1179)[name = tensor<string, []>("M_9")];
-            tensor<fp32, [12, 4, 5, 5]> var_1181 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1181")];
+            tensor<int32, [2]> var_1218 = const()[name = tensor<string, []>("op_1218"), val = tensor<int32, [2]>([1, 5])];
+            tensor<fp32, [1, 5]> var_1219 = reshape(shape = var_1218, x = valid_mask)[name = tensor<string, []>("op_1219")];
+            tensor<fp32, [5, 5]> causal_with_valid_1 = mul(x = inner_decoder__causal_mask, y = var_1219)[name = tensor<string, []>("causal_with_valid_1")];
+            tensor<int32, [2]> var_1221 = const()[name = tensor<string, []>("op_1221"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1222 = reshape(shape = var_1221, x = sqrt_s_t_9)[name = tensor<string, []>("op_1222")];
+            tensor<fp32, [5, 5]> M_9 = real_div(x = causal_with_valid_1, y = var_1222)[name = tensor<string, []>("M_9")];
+            tensor<fp32, [12, 4, 5, 5]> var_1224 = mul(x = qk_9, y = M_9)[name = tensor<string, []>("op_1224")];
             tensor<bool, []> inner_9_transpose_x_0 = const()[name = tensor<string, []>("inner_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> inner_9_transpose_y_0 = const()[name = tensor<string, []>("inner_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1163)[name = tensor<string, []>("transpose_25")];
-            tensor<fp32, [12, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1181, y = v_9)[name = tensor<string, []>("inner_9")];
-            tensor<bool, []> var_1183_transpose_x_0 = const()[name = tensor<string, []>("op_1183_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1183_transpose_y_0 = const()[name = tensor<string, []>("op_1183_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> var_1183 = matmul(transpose_x = var_1183_transpose_x_0, transpose_y = var_1183_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1183")];
-            tensor<fp32, [5]> var_1184 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1184")];
-            tensor<int32, [4]> var_1185 = const()[name = tensor<string, []>("op_1185"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1186 = reshape(shape = var_1185, x = var_1184)[name = tensor<string, []>("op_1186")];
-            tensor<fp32, [12, 4, 5, 64]> cross_9 = mul(x = var_1183, y = var_1186)[name = tensor<string, []>("cross_9")];
+            tensor<fp32, [12, 4, 5, 64]> v_9 = transpose(perm = v_9_perm_0, x = var_1206)[name = tensor<string, []>("transpose_25")];
+            tensor<fp32, [12, 4, 5, 64]> inner_9 = matmul(transpose_x = inner_9_transpose_x_0, transpose_y = inner_9_transpose_y_0, x = var_1224, y = v_9)[name = tensor<string, []>("inner_9")];
+            tensor<bool, []> var_1226_transpose_x_0 = const()[name = tensor<string, []>("op_1226_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1226_transpose_y_0 = const()[name = tensor<string, []>("op_1226_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> var_1226 = matmul(transpose_x = var_1226_transpose_x_0, transpose_y = var_1226_transpose_y_0, x = q_9, y = prev_kv_9)[name = tensor<string, []>("op_1226")];
+            tensor<fp32, [5]> var_1227 = real_div(x = sqrt_s0_9, y = sqrt_s_t_9)[name = tensor<string, []>("op_1227")];
+            tensor<int32, [4]> var_1228 = const()[name = tensor<string, []>("op_1228"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1229 = reshape(shape = var_1228, x = var_1227)[name = tensor<string, []>("op_1229")];
+            tensor<fp32, [12, 4, 5, 64]> cross_9 = mul(x = var_1226, y = var_1229)[name = tensor<string, []>("cross_9")];
             tensor<fp32, [12, 4, 5, 64]> out_25 = add(x = inner_9, y = cross_9)[name = tensor<string, []>("out_25")];
-            tensor<int32, [4]> var_1189 = const()[name = tensor<string, []>("op_1189"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1190 = reshape(shape = var_1189, x = valid_mask)[name = tensor<string, []>("op_1190")];
-            tensor<fp32, [12, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1190)[name = tensor<string, []>("v_masked_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1192 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1192")];
-            tensor<bool, []> var_1194_transpose_x_1 = const()[name = tensor<string, []>("op_1194_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1194_transpose_y_1 = const()[name = tensor<string, []>("op_1194_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1194 = matmul(transpose_x = var_1194_transpose_x_1, transpose_y = var_1194_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1194")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1192, y = var_1194)[name = tensor<string, []>("new_kv_unnorm_9")];
-            tensor<bool, []> var_1196_keep_dims_0 = const()[name = tensor<string, []>("op_1196_keep_dims_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, []> var_1196 = reduce_sum(keep_dims = var_1196_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1196")];
-            tensor<int32, [1]> var_1197 = const()[name = tensor<string, []>("op_1197"), val = tensor<int32, [1]>([1])];
-            tensor<fp32, [1]> var_1198 = reshape(shape = var_1197, x = var_1196)[name = tensor<string, []>("op_1198")];
-            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1198)[name = tensor<string, []>("new_scale_9")];
+            tensor<int32, [4]> var_1232 = const()[name = tensor<string, []>("op_1232"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1233 = reshape(shape = var_1232, x = valid_mask)[name = tensor<string, []>("op_1233")];
+            tensor<fp32, [12, 4, 5, 64]> v_masked_1 = mul(x = v_9, y = var_1233)[name = tensor<string, []>("v_masked_1")];
+            tensor<fp32, [12, 4, 64, 64]> var_1235 = mul(x = prev_kv_9, y = sqrt_s0_9)[name = tensor<string, []>("op_1235")];
+            tensor<bool, []> var_1237_transpose_x_1 = const()[name = tensor<string, []>("op_1237_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1237_transpose_y_1 = const()[name = tensor<string, []>("op_1237_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1237 = matmul(transpose_x = var_1237_transpose_x_1, transpose_y = var_1237_transpose_y_1, x = k_9, y = v_masked_1)[name = tensor<string, []>("op_1237")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm_9 = add(x = var_1235, y = var_1237)[name = tensor<string, []>("new_kv_unnorm_9")];
+            tensor<bool, []> var_1239_keep_dims_0 = const()[name = tensor<string, []>("op_1239_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, []> var_1239 = reduce_sum(keep_dims = var_1239_keep_dims_0, x = valid_mask)[name = tensor<string, []>("op_1239")];
+            tensor<int32, [1]> var_1240 = const()[name = tensor<string, []>("op_1240"), val = tensor<int32, [1]>([1])];
+            tensor<fp32, [1]> var_1241 = reshape(shape = var_1240, x = var_1239)[name = tensor<string, []>("op_1241")];
+            tensor<fp32, [1]> new_scale_9 = add(x = prev_scale_9, y = var_1241)[name = tensor<string, []>("new_scale_9")];
             tensor<fp32, []> const_21 = const()[name = tensor<string, []>("const_21"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_2 = clip(alpha = var_1055, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
+            tensor<fp32, [1]> clip_2 = clip(alpha = var_69, beta = const_21, x = new_scale_9)[name = tensor<string, []>("clip_2")];
             tensor<fp32, [1]> sqrt_new_scale_1 = sqrt(x = clip_2)[name = tensor<string, []>("sqrt_new_scale_1")];
-            tensor<fp32, [12, 4, 64, 64]> var_1202 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1202")];
-            tensor<int32, [4]> var_1203_perm_0 = const()[name = tensor<string, []>("op_1203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp32, [12, 4, 64, 64]> var_1245 = real_div(x = new_kv_unnorm_9, y = sqrt_new_scale_1)[name = tensor<string, []>("op_1245")];
+            tensor<int32, [4]> var_1246_perm_0 = const()[name = tensor<string, []>("op_1246_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 4, 64]> var_1203 = transpose(perm = var_1203_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
-            tensor<fp32, [12, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_1052, x = var_1203)[name = tensor<string, []>("out_27")];
-            tensor<int32, [3]> var_1207 = const()[name = tensor<string, []>("op_1207"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> out_29 = reshape(shape = var_1207, x = out_27)[name = tensor<string, []>("out_29")];
-            tensor<fp32, [12, 5, 256]> var_1209 = silu(x = input_169)[name = tensor<string, []>("op_1209")];
-            tensor<fp32, [12, 5, 256]> input_171 = mul(x = var_1209, y = out_29)[name = tensor<string, []>("input_171")];
-            tensor<fp32, [12, 5, 256]> ret_out_9 = linear(bias = decoder_out_proj_0_bias, weight = decoder_out_proj_0_weight, x = input_171)[name = tensor<string, []>("linear_42")];
-            tensor<fp32, [12, 5, 256]> input_173 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 5, 4, 64]> var_1246 = transpose(perm = var_1246_perm_0, x = out_25)[name = tensor<string, []>("transpose_24")];
+            tensor<fp32, [12, 5, 4, 64]> out_27 = layer_norm(axes = out_27_axes_0, epsilon = var_84, x = var_1246)[name = tensor<string, []>("out_27")];
+            tensor<int32, [3]> var_1250 = const()[name = tensor<string, []>("op_1250"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> out_29 = reshape(shape = var_1250, x = out_27)[name = tensor<string, []>("out_29")];
+            tensor<fp32, [12, 5, 256]> var_1252 = silu(x = input_171)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [12, 5, 256]> input_173 = mul(x = var_1252, y = out_29)[name = tensor<string, []>("input_173")];
+            tensor<fp32, [12, 5, 256]> ret_out_9 = linear(bias = inner_decoder_out_proj_0_bias, weight = inner_decoder_out_proj_0_weight, x = input_173)[name = tensor<string, []>("linear_42")];
+            tensor<fp32, [12, 5, 256]> input_175 = add(x = x_29, y = ret_out_9)[name = tensor<string, []>("input_175")];
             tensor<int32, [1]> xt_1_axes_0 = const()[name = tensor<string, []>("xt_1_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = decoder_norm11_0_bias, epsilon = var_1050, gamma = decoder_norm11_0_weight, x = input_173)[name = tensor<string, []>("xt_1")];
-            tensor<int32, [4]> var_1219 = const()[name = tensor<string, []>("op_1219"), val = tensor<int32, [4]>([1, 12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1220 = reshape(shape = var_1219, x = xt_1)[name = tensor<string, []>("op_1220")];
-            tensor<int32, [4]> var_1221_perm_0 = const()[name = tensor<string, []>("op_1221_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1224 = const()[name = tensor<string, []>("op_1224"), val = tensor<int32, [3]>([5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> var_1221 = transpose(perm = var_1221_perm_0, x = var_1220)[name = tensor<string, []>("transpose_23")];
-            tensor<fp32, [5, 12, 256]> query_1 = reshape(shape = var_1224, x = var_1221)[name = tensor<string, []>("query_1")];
+            tensor<fp32, [12, 5, 256]> xt_1 = layer_norm(axes = xt_1_axes_0, beta = inner_decoder_norm11_0_bias, epsilon = var_76, gamma = inner_decoder_norm11_0_weight, x = input_175)[name = tensor<string, []>("xt_1")];
+            tensor<int32, [4]> var_1262 = const()[name = tensor<string, []>("op_1262"), val = tensor<int32, [4]>([1, 12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1263 = reshape(shape = var_1262, x = xt_1)[name = tensor<string, []>("op_1263")];
+            tensor<int32, [4]> var_1264_perm_0 = const()[name = tensor<string, []>("op_1264_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> var_1264 = transpose(perm = var_1264_perm_0, x = var_1263)[name = tensor<string, []>("transpose_23")];
+            tensor<fp32, [5, 12, 256]> query_1 = reshape(shape = var_1267, x = var_1264)[name = tensor<string, []>("query_1")];
             tensor<int32, [3]> query_3_perm_0 = const()[name = tensor<string, []>("query_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 5, 256]> query_3 = transpose(perm = query_3_perm_0, x = query_1)[name = tensor<string, []>("transpose_22")];
-            tensor<fp32, [12, 5, 768]> var_1247 = linear(bias = decoder_self_attn2_0_in_proj_bias, weight = decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
+            tensor<fp32, [12, 5, 768]> var_1290 = linear(bias = inner_decoder_self_attn2_0_in_proj_bias, weight = inner_decoder_self_attn2_0_in_proj_weight, x = query_3)[name = tensor<string, []>("linear_43")];
             tensor<int32, [4]> concat_1 = const()[name = tensor<string, []>("concat_1"), val = tensor<int32, [4]>([12, 5, 3, 256])];
-            tensor<fp32, [12, 5, 3, 256]> var_1249 = reshape(shape = concat_1, x = var_1247)[name = tensor<string, []>("op_1249")];
-            tensor<int32, [1]> var_1250_axes_0 = const()[name = tensor<string, []>("op_1250_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 5, 3, 256]> var_1250 = expand_dims(axes = var_1250_axes_0, x = var_1249)[name = tensor<string, []>("op_1250")];
-            tensor<int32, [5]> var_1251_perm_0 = const()[name = tensor<string, []>("op_1251_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1252_axes_0 = const()[name = tensor<string, []>("op_1252_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 5, 1, 256]> var_1251 = transpose(perm = var_1251_perm_0, x = var_1250)[name = tensor<string, []>("transpose_21")];
-            tensor<fp32, [3, 12, 5, 256]> var_1252 = squeeze(axes = var_1252_axes_0, x = var_1251)[name = tensor<string, []>("op_1252")];
+            tensor<fp32, [12, 5, 3, 256]> var_1292 = reshape(shape = concat_1, x = var_1290)[name = tensor<string, []>("op_1292")];
+            tensor<int32, [1]> var_1293_axes_0 = const()[name = tensor<string, []>("op_1293_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 5, 3, 256]> var_1293 = expand_dims(axes = var_1293_axes_0, x = var_1292)[name = tensor<string, []>("op_1293")];
+            tensor<int32, [5]> var_1294_perm_0 = const()[name = tensor<string, []>("op_1294_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1295_axes_0 = const()[name = tensor<string, []>("op_1295_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 5, 1, 256]> var_1294 = transpose(perm = var_1294_perm_0, x = var_1293)[name = tensor<string, []>("transpose_21")];
+            tensor<fp32, [3, 12, 5, 256]> var_1295 = squeeze(axes = var_1295_axes_0, x = var_1294)[name = tensor<string, []>("op_1295")];
             tensor<int32, [4]> q_11_begin_0 = const()[name = tensor<string, []>("q_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_11_end_0 = const()[name = tensor<string, []>("q_11_end_0"), val = tensor<int32, [4]>([1, 12, 5, 256])];
             tensor<bool, [4]> q_11_end_mask_0 = const()[name = tensor<string, []>("q_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_11_squeeze_mask_0 = const()[name = tensor<string, []>("q_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("q_11")];
+            tensor<fp32, [12, 5, 256]> q_11 = slice_by_index(begin = q_11_begin_0, end = q_11_end_0, end_mask = q_11_end_mask_0, squeeze_mask = q_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("q_11")];
             tensor<int32, [4]> k_11_begin_0 = const()[name = tensor<string, []>("k_11_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_11_end_0 = const()[name = tensor<string, []>("k_11_end_0"), val = tensor<int32, [4]>([2, 12, 5, 256])];
             tensor<bool, [4]> k_11_end_mask_0 = const()[name = tensor<string, []>("k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_11_squeeze_mask_0 = const()[name = tensor<string, []>("k_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("k_11")];
+            tensor<fp32, [12, 5, 256]> k_11 = slice_by_index(begin = k_11_begin_0, end = k_11_end_0, end_mask = k_11_end_mask_0, squeeze_mask = k_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("k_11")];
             tensor<int32, [4]> v_11_begin_0 = const()[name = tensor<string, []>("v_11_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_11_end_0 = const()[name = tensor<string, []>("v_11_end_0"), val = tensor<int32, [4]>([3, 12, 5, 256])];
             tensor<bool, [4]> v_11_end_mask_0 = const()[name = tensor<string, []>("v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_11_squeeze_mask_0 = const()[name = tensor<string, []>("v_11_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1252)[name = tensor<string, []>("v_11")];
-            tensor<int32, [3]> var_1260 = const()[name = tensor<string, []>("op_1260"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1261 = reshape(shape = var_1260, x = q_11)[name = tensor<string, []>("op_1261")];
+            tensor<fp32, [12, 5, 256]> v_11 = slice_by_index(begin = v_11_begin_0, end = v_11_end_0, end_mask = v_11_end_mask_0, squeeze_mask = v_11_squeeze_mask_0, x = var_1295)[name = tensor<string, []>("v_11")];
+            tensor<int32, [3]> var_1303 = const()[name = tensor<string, []>("op_1303"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1304 = reshape(shape = var_1303, x = q_11)[name = tensor<string, []>("op_1304")];
             tensor<int32, [3]> q_13_perm_0 = const()[name = tensor<string, []>("q_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1267 = const()[name = tensor<string, []>("op_1267"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1268 = reshape(shape = var_1267, x = k_11)[name = tensor<string, []>("op_1268")];
+            tensor<int32, [3]> var_1310 = const()[name = tensor<string, []>("op_1310"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1311 = reshape(shape = var_1310, x = k_11)[name = tensor<string, []>("op_1311")];
             tensor<int32, [3]> k_13_perm_0 = const()[name = tensor<string, []>("k_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1274 = const()[name = tensor<string, []>("op_1274"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1275 = reshape(shape = var_1274, x = v_11)[name = tensor<string, []>("op_1275")];
+            tensor<int32, [3]> var_1317 = const()[name = tensor<string, []>("op_1317"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1318 = reshape(shape = var_1317, x = v_11)[name = tensor<string, []>("op_1318")];
             tensor<int32, [3]> v_13_perm_0 = const()[name = tensor<string, []>("v_13_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1278 = const()[name = tensor<string, []>("op_1278"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1261)[name = tensor<string, []>("transpose_20")];
-            tensor<fp32, [5, 4, 12, 64]> q_15 = reshape(shape = var_1278, x = q_13)[name = tensor<string, []>("q_15")];
-            tensor<int32, [4]> var_1280 = const()[name = tensor<string, []>("op_1280"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1268)[name = tensor<string, []>("transpose_19")];
-            tensor<fp32, [5, 4, 12, 64]> k_15 = reshape(shape = var_1280, x = k_13)[name = tensor<string, []>("k_15")];
-            tensor<int32, [4]> var_1282 = const()[name = tensor<string, []>("op_1282"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1275)[name = tensor<string, []>("transpose_18")];
-            tensor<fp32, [5, 4, 12, 64]> v_15 = reshape(shape = var_1282, x = v_13)[name = tensor<string, []>("v_15")];
+            tensor<int32, [4]> var_1321 = const()[name = tensor<string, []>("op_1321"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> q_13 = transpose(perm = q_13_perm_0, x = var_1304)[name = tensor<string, []>("transpose_20")];
+            tensor<fp32, [5, 4, 12, 64]> q_15 = reshape(shape = var_1321, x = q_13)[name = tensor<string, []>("q_15")];
+            tensor<int32, [4]> var_1323 = const()[name = tensor<string, []>("op_1323"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> k_13 = transpose(perm = k_13_perm_0, x = var_1311)[name = tensor<string, []>("transpose_19")];
+            tensor<fp32, [5, 4, 12, 64]> k_15 = reshape(shape = var_1323, x = k_13)[name = tensor<string, []>("k_15")];
+            tensor<int32, [4]> var_1325 = const()[name = tensor<string, []>("op_1325"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> v_13 = transpose(perm = v_13_perm_0, x = var_1318)[name = tensor<string, []>("transpose_18")];
+            tensor<fp32, [5, 4, 12, 64]> v_15 = reshape(shape = var_1325, x = v_13)[name = tensor<string, []>("v_15")];
             tensor<fp32, []> mul_1_y_0 = const()[name = tensor<string, []>("mul_1_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 12, 64]> mul_1 = mul(x = q_15, y = mul_1_y_0)[name = tensor<string, []>("mul_1")];
             tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1159,30 +1178,30 @@ program(1.0)
             tensor<bool, []> attn_output_1_transpose_x_0 = const()[name = tensor<string, []>("attn_output_1_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_1_transpose_y_0 = const()[name = tensor<string, []>("attn_output_1_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 12, 64]> attn_output_1 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0, y = v_15)[name = tensor<string, []>("attn_output_1")];
-            tensor<int32, [4]> var_1285 = const()[name = tensor<string, []>("op_1285"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1290 = const()[name = tensor<string, []>("op_1290"), val = tensor<int32, [2]>([60, 256])];
-            tensor<fp32, [12, 5, 4, 64]> var_1286 = transpose(perm = var_1285, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
-            tensor<fp32, [60, 256]> attn_output_3 = reshape(shape = var_1290, x = var_1286)[name = tensor<string, []>("attn_output_3")];
-            tensor<fp32, [60, 256]> attn_output_5 = linear(bias = decoder_self_attn2_0_out_proj_bias, weight = decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
-            tensor<int32, [3]> var_1294 = const()[name = tensor<string, []>("op_1294"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> attn_output_7 = reshape(shape = var_1294, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
+            tensor<int32, [4]> var_1328 = const()[name = tensor<string, []>("op_1328"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1333 = const()[name = tensor<string, []>("op_1333"), val = tensor<int32, [2]>([60, 256])];
+            tensor<fp32, [12, 5, 4, 64]> var_1329 = transpose(perm = var_1328, x = attn_output_1)[name = tensor<string, []>("transpose_17")];
+            tensor<fp32, [60, 256]> attn_output_3 = reshape(shape = var_1333, x = var_1329)[name = tensor<string, []>("attn_output_3")];
+            tensor<fp32, [60, 256]> attn_output_5 = linear(bias = inner_decoder_self_attn2_0_out_proj_bias, weight = inner_decoder_self_attn2_0_out_proj_weight, x = attn_output_3)[name = tensor<string, []>("linear_44")];
+            tensor<int32, [3]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> attn_output_7 = reshape(shape = var_1337, x = attn_output_5)[name = tensor<string, []>("attn_output_7")];
             tensor<int32, [3]> sa2_out_1_perm_0 = const()[name = tensor<string, []>("sa2_out_1_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 12, 256]> sa2_out_1 = transpose(perm = sa2_out_1_perm_0, x = attn_output_7)[name = tensor<string, []>("transpose_16")];
-            tensor<fp32, [5, 12, 256]> input_175 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_175")];
-            tensor<int32, [1]> input_177_axes_0 = const()[name = tensor<string, []>("input_177_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> input_177 = layer_norm(axes = input_177_axes_0, beta = decoder_norm21_0_bias, epsilon = var_1050, gamma = decoder_norm21_0_weight, x = input_175)[name = tensor<string, []>("input_177")];
-            tensor<fp32, [5, 12, 2048]> input_179 = linear(bias = decoder_linear1_0_bias, weight = decoder_linear1_0_weight, x = input_177)[name = tensor<string, []>("linear_45")];
-            tensor<fp32, [5, 12, 2048]> input_181 = relu(x = input_179)[name = tensor<string, []>("input_181")];
-            tensor<fp32, [5, 12, 256]> ffn_out_1 = linear(bias = decoder_linear2_0_bias, weight = decoder_linear2_0_weight, x = input_181)[name = tensor<string, []>("linear_46")];
-            tensor<fp32, [5, 12, 256]> input_183 = add(x = input_177, y = ffn_out_1)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 12, 256]> input_177 = add(x = query_1, y = sa2_out_1)[name = tensor<string, []>("input_177")];
+            tensor<int32, [1]> input_179_axes_0 = const()[name = tensor<string, []>("input_179_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 12, 256]> input_179 = layer_norm(axes = input_179_axes_0, beta = inner_decoder_norm21_0_bias, epsilon = var_76, gamma = inner_decoder_norm21_0_weight, x = input_177)[name = tensor<string, []>("input_179")];
+            tensor<fp32, [5, 12, 2048]> input_181 = linear(bias = inner_decoder_linear1_0_bias, weight = inner_decoder_linear1_0_weight, x = input_179)[name = tensor<string, []>("linear_45")];
+            tensor<fp32, [5, 12, 2048]> input_183 = relu(x = input_181)[name = tensor<string, []>("input_183")];
+            tensor<fp32, [5, 12, 256]> ffn_out_1 = linear(bias = inner_decoder_linear2_0_bias, weight = inner_decoder_linear2_0_weight, x = input_183)[name = tensor<string, []>("linear_46")];
+            tensor<fp32, [5, 12, 256]> input_185 = add(x = input_179, y = ffn_out_1)[name = tensor<string, []>("input_185")];
             tensor<int32, [1]> xt_3_axes_0 = const()[name = tensor<string, []>("xt_3_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = decoder_norm22_0_bias, epsilon = var_1050, gamma = decoder_norm22_0_weight, x = input_183)[name = tensor<string, []>("xt_3")];
-            tensor<int32, [4]> var_1314 = const()[name = tensor<string, []>("op_1314"), val = tensor<int32, [4]>([1, 5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> x_31 = reshape(shape = var_1314, x = xt_3)[name = tensor<string, []>("x_31")];
-            tensor<int32, [4]> var_1316_perm_0 = const()[name = tensor<string, []>("op_1316_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1320 = const()[name = tensor<string, []>("op_1320"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1316 = transpose(perm = var_1316_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
-            tensor<fp32, [12, 5, 256]> x = reshape(shape = var_1320, x = var_1316)[name = tensor<string, []>("x")];
+            tensor<fp32, [5, 12, 256]> xt_3 = layer_norm(axes = xt_3_axes_0, beta = inner_decoder_norm22_0_bias, epsilon = var_76, gamma = inner_decoder_norm22_0_weight, x = input_185)[name = tensor<string, []>("xt_3")];
+            tensor<int32, [4]> var_1357 = const()[name = tensor<string, []>("op_1357"), val = tensor<int32, [4]>([1, 5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> x_31 = reshape(shape = var_1357, x = xt_3)[name = tensor<string, []>("x_31")];
+            tensor<int32, [4]> var_1359_perm_0 = const()[name = tensor<string, []>("op_1359_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1363 = const()[name = tensor<string, []>("op_1363"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1359 = transpose(perm = var_1359_perm_0, x = x_31)[name = tensor<string, []>("transpose_15")];
+            tensor<fp32, [12, 5, 256]> x = reshape(shape = var_1363, x = var_1359)[name = tensor<string, []>("x")];
             tensor<int32, [5]> prev_kv_begin_0 = const()[name = tensor<string, []>("prev_kv_begin_0"), val = tensor<int32, [5]>([1, 0, 0, 0, 0])];
             tensor<int32, [5]> prev_kv_end_0 = const()[name = tensor<string, []>("prev_kv_end_0"), val = tensor<int32, [5]>([2, 12, 4, 64, 64])];
             tensor<bool, [5]> prev_kv_end_mask_0 = const()[name = tensor<string, []>("prev_kv_end_mask_0"), val = tensor<bool, [5]>([false, true, true, true, true])];
@@ -1193,120 +1212,120 @@ program(1.0)
             tensor<bool, [2]> prev_scale_end_mask_0 = const()[name = tensor<string, []>("prev_scale_end_mask_0"), val = tensor<bool, [2]>([false, true])];
             tensor<bool, [2]> prev_scale_squeeze_mask_0 = const()[name = tensor<string, []>("prev_scale_squeeze_mask_0"), val = tensor<bool, [2]>([true, false])];
             tensor<fp32, [1]> prev_scale = slice_by_index(begin = prev_scale_begin_0, end = prev_scale_end_0, end_mask = prev_scale_end_mask_0, squeeze_mask = prev_scale_squeeze_mask_0, x = dec_scale)[name = tensor<string, []>("prev_scale")];
-            tensor<fp32, [12, 5, 256]> var_1328 = linear(bias = decoder_q_proj_1_bias, weight = decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
-            tensor<int32, [4]> var_1329 = const()[name = tensor<string, []>("op_1329"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1330 = reshape(shape = var_1329, x = var_1328)[name = tensor<string, []>("op_1330")];
+            tensor<fp32, [12, 5, 256]> var_1371 = linear(bias = inner_decoder_q_proj_1_bias, weight = inner_decoder_q_proj_1_weight, x = x)[name = tensor<string, []>("linear_47")];
+            tensor<int32, [4]> var_1372 = const()[name = tensor<string, []>("op_1372"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1373 = reshape(shape = var_1372, x = var_1371)[name = tensor<string, []>("op_1373")];
             tensor<int32, [4]> q_17_perm_0 = const()[name = tensor<string, []>("q_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1334 = linear(bias = decoder_k_proj_1_bias, weight = decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
-            tensor<fp32, []> var_1335 = const()[name = tensor<string, []>("op_1335"), val = tensor<fp32, []>(0x1p-3)];
-            tensor<fp32, [12, 5, 256]> var_1336 = mul(x = var_1334, y = var_1335)[name = tensor<string, []>("op_1336")];
-            tensor<int32, [4]> var_1337 = const()[name = tensor<string, []>("op_1337"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1338 = reshape(shape = var_1337, x = var_1336)[name = tensor<string, []>("op_1338")];
+            tensor<fp32, [12, 5, 256]> var_1377 = linear(bias = inner_decoder_k_proj_1_bias, weight = inner_decoder_k_proj_1_weight, x = x)[name = tensor<string, []>("linear_48")];
+            tensor<fp32, []> var_1378 = const()[name = tensor<string, []>("op_1378"), val = tensor<fp32, []>(0x1p-3)];
+            tensor<fp32, [12, 5, 256]> var_1379 = mul(x = var_1377, y = var_1378)[name = tensor<string, []>("op_1379")];
+            tensor<int32, [4]> var_1380 = const()[name = tensor<string, []>("op_1380"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1381 = reshape(shape = var_1380, x = var_1379)[name = tensor<string, []>("op_1381")];
             tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> var_1342 = linear(bias = decoder_v_proj_1_bias, weight = decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
-            tensor<int32, [4]> var_1343 = const()[name = tensor<string, []>("op_1343"), val = tensor<int32, [4]>([12, 5, 4, 64])];
-            tensor<fp32, [12, 5, 4, 64]> var_1344 = reshape(shape = var_1343, x = var_1342)[name = tensor<string, []>("op_1344")];
+            tensor<fp32, [12, 5, 256]> var_1385 = linear(bias = inner_decoder_v_proj_1_bias, weight = inner_decoder_v_proj_1_weight, x = x)[name = tensor<string, []>("linear_49")];
+            tensor<int32, [4]> var_1386 = const()[name = tensor<string, []>("op_1386"), val = tensor<int32, [4]>([12, 5, 4, 64])];
+            tensor<fp32, [12, 5, 4, 64]> var_1387 = reshape(shape = var_1386, x = var_1385)[name = tensor<string, []>("op_1387")];
             tensor<int32, [4]> v_17_perm_0 = const()[name = tensor<string, []>("v_17_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<fp32, [12, 5, 256]> input_187 = linear(bias = decoder_g_proj_1_bias, weight = decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
+            tensor<fp32, [12, 5, 256]> input_189 = linear(bias = inner_decoder_g_proj_1_bias, weight = inner_decoder_g_proj_1_weight, x = x)[name = tensor<string, []>("linear_50")];
             tensor<fp32, [1]> sqrt_s0 = sqrt(x = prev_scale)[name = tensor<string, []>("sqrt_s0")];
             tensor<fp32, [5]> s_t = add(x = prev_scale, y = cumsum_mask_1)[name = tensor<string, []>("s_t")];
             tensor<fp32, []> const_32 = const()[name = tensor<string, []>("const_32"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [5]> clip_3 = clip(alpha = var_1055, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
+            tensor<fp32, [5]> clip_3 = clip(alpha = var_69, beta = const_32, x = s_t)[name = tensor<string, []>("clip_3")];
             tensor<fp32, [5]> sqrt_s_t = sqrt(x = clip_3)[name = tensor<string, []>("sqrt_s_t")];
             tensor<bool, []> qk_transpose_x_1 = const()[name = tensor<string, []>("qk_transpose_x_1"), val = tensor<bool, []>(false)];
             tensor<bool, []> qk_transpose_y_1 = const()[name = tensor<string, []>("qk_transpose_y_1"), val = tensor<bool, []>(true)];
-            tensor<fp32, [12, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1338)[name = tensor<string, []>("transpose_13")];
-            tensor<fp32, [12, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1330)[name = tensor<string, []>("transpose_14")];
+            tensor<fp32, [12, 4, 5, 64]> k_17 = transpose(perm = k_17_perm_0, x = var_1381)[name = tensor<string, []>("transpose_13")];
+            tensor<fp32, [12, 4, 5, 64]> q_17 = transpose(perm = q_17_perm_0, x = var_1373)[name = tensor<string, []>("transpose_14")];
             tensor<fp32, [12, 4, 5, 5]> qk = matmul(transpose_x = qk_transpose_x_1, transpose_y = qk_transpose_y_1, x = q_17, y = k_17)[name = tensor<string, []>("qk")];
-            tensor<int32, [2]> var_1359 = const()[name = tensor<string, []>("op_1359"), val = tensor<int32, [2]>([5, 1])];
-            tensor<fp32, [5, 1]> var_1360 = reshape(shape = var_1359, x = sqrt_s_t)[name = tensor<string, []>("op_1360")];
-            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1360)[name = tensor<string, []>("M")];
-            tensor<fp32, [12, 4, 5, 5]> var_1362 = mul(x = qk, y = M)[name = tensor<string, []>("op_1362")];
-            tensor<bool, []> inner_transpose_x_0 = const()[name = tensor<string, []>("inner_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> inner_transpose_y_0 = const()[name = tensor<string, []>("inner_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1344)[name = tensor<string, []>("transpose_12")];
-            tensor<fp32, [12, 4, 5, 64]> inner = matmul(transpose_x = inner_transpose_x_0, transpose_y = inner_transpose_y_0, x = var_1362, y = v_17)[name = tensor<string, []>("inner")];
-            tensor<bool, []> var_1364_transpose_x_0 = const()[name = tensor<string, []>("op_1364_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> var_1364_transpose_y_0 = const()[name = tensor<string, []>("op_1364_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 5, 64]> var_1364 = matmul(transpose_x = var_1364_transpose_x_0, transpose_y = var_1364_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1364")];
-            tensor<fp32, [5]> var_1365 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1365")];
-            tensor<int32, [4]> var_1366 = const()[name = tensor<string, []>("op_1366"), val = tensor<int32, [4]>([1, 1, 5, 1])];
-            tensor<fp32, [1, 1, 5, 1]> var_1367 = reshape(shape = var_1366, x = var_1365)[name = tensor<string, []>("op_1367")];
-            tensor<fp32, [12, 4, 5, 64]> cross = mul(x = var_1364, y = var_1367)[name = tensor<string, []>("cross")];
-            tensor<fp32, [12, 4, 5, 64]> out_31 = add(x = inner, y = cross)[name = tensor<string, []>("out_31")];
-            tensor<fp32, [12, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1190)[name = tensor<string, []>("v_masked")];
-            tensor<fp32, [12, 4, 64, 64]> var_1373 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1373")];
-            tensor<bool, []> var_1375_transpose_x_1 = const()[name = tensor<string, []>("op_1375_transpose_x_1"), val = tensor<bool, []>(true)];
-            tensor<bool, []> var_1375_transpose_y_1 = const()[name = tensor<string, []>("op_1375_transpose_y_1"), val = tensor<bool, []>(false)];
-            tensor<fp32, [12, 4, 64, 64]> var_1375 = matmul(transpose_x = var_1375_transpose_x_1, transpose_y = var_1375_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1375")];
-            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1373, y = var_1375)[name = tensor<string, []>("new_kv_unnorm")];
-            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1198)[name = tensor<string, []>("new_scale")];
+            tensor<int32, [2]> var_1402 = const()[name = tensor<string, []>("op_1402"), val = tensor<int32, [2]>([5, 1])];
+            tensor<fp32, [5, 1]> var_1403 = reshape(shape = var_1402, x = sqrt_s_t)[name = tensor<string, []>("op_1403")];
+            tensor<fp32, [5, 5]> M = real_div(x = causal_with_valid_1, y = var_1403)[name = tensor<string, []>("M")];
+            tensor<fp32, [12, 4, 5, 5]> var_1405 = mul(x = qk, y = M)[name = tensor<string, []>("op_1405")];
+            tensor<bool, []> inner_11_transpose_x_0 = const()[name = tensor<string, []>("inner_11_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> inner_11_transpose_y_0 = const()[name = tensor<string, []>("inner_11_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> v_17 = transpose(perm = v_17_perm_0, x = var_1387)[name = tensor<string, []>("transpose_12")];
+            tensor<fp32, [12, 4, 5, 64]> inner_11 = matmul(transpose_x = inner_11_transpose_x_0, transpose_y = inner_11_transpose_y_0, x = var_1405, y = v_17)[name = tensor<string, []>("inner_11")];
+            tensor<bool, []> var_1407_transpose_x_0 = const()[name = tensor<string, []>("op_1407_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_1407_transpose_y_0 = const()[name = tensor<string, []>("op_1407_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 5, 64]> var_1407 = matmul(transpose_x = var_1407_transpose_x_0, transpose_y = var_1407_transpose_y_0, x = q_17, y = prev_kv)[name = tensor<string, []>("op_1407")];
+            tensor<fp32, [5]> var_1408 = real_div(x = sqrt_s0, y = sqrt_s_t)[name = tensor<string, []>("op_1408")];
+            tensor<int32, [4]> var_1409 = const()[name = tensor<string, []>("op_1409"), val = tensor<int32, [4]>([1, 1, 5, 1])];
+            tensor<fp32, [1, 1, 5, 1]> var_1410 = reshape(shape = var_1409, x = var_1408)[name = tensor<string, []>("op_1410")];
+            tensor<fp32, [12, 4, 5, 64]> cross = mul(x = var_1407, y = var_1410)[name = tensor<string, []>("cross")];
+            tensor<fp32, [12, 4, 5, 64]> out_31 = add(x = inner_11, y = cross)[name = tensor<string, []>("out_31")];
+            tensor<fp32, [12, 4, 5, 64]> v_masked = mul(x = v_17, y = var_1233)[name = tensor<string, []>("v_masked")];
+            tensor<fp32, [12, 4, 64, 64]> var_1416 = mul(x = prev_kv, y = sqrt_s0)[name = tensor<string, []>("op_1416")];
+            tensor<bool, []> var_1418_transpose_x_1 = const()[name = tensor<string, []>("op_1418_transpose_x_1"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_1418_transpose_y_1 = const()[name = tensor<string, []>("op_1418_transpose_y_1"), val = tensor<bool, []>(false)];
+            tensor<fp32, [12, 4, 64, 64]> var_1418 = matmul(transpose_x = var_1418_transpose_x_1, transpose_y = var_1418_transpose_y_1, x = k_17, y = v_masked)[name = tensor<string, []>("op_1418")];
+            tensor<fp32, [12, 4, 64, 64]> new_kv_unnorm = add(x = var_1416, y = var_1418)[name = tensor<string, []>("new_kv_unnorm")];
+            tensor<fp32, [1]> new_scale = add(x = prev_scale, y = var_1241)[name = tensor<string, []>("new_scale")];
             tensor<fp32, []> const_33 = const()[name = tensor<string, []>("const_33"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1]> clip_4 = clip(alpha = var_1055, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
+            tensor<fp32, [1]> clip_4 = clip(alpha = var_69, beta = const_33, x = new_scale)[name = tensor<string, []>("clip_4")];
             tensor<fp32, [1]> sqrt_new_scale = sqrt(x = clip_4)[name = tensor<string, []>("sqrt_new_scale")];
             tensor<fp32, [12, 4, 64, 64]> nkv = real_div(x = new_kv_unnorm, y = sqrt_new_scale)[name = tensor<string, []>("nkv")];
-            tensor<int32, [4]> var_1384_perm_0 = const()[name = tensor<string, []>("op_1384_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [4]> var_1427_perm_0 = const()[name = tensor<string, []>("op_1427_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 4, 64]> var_1384 = transpose(perm = var_1384_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
-            tensor<fp32, [12, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_1052, x = var_1384)[name = tensor<string, []>("out_33")];
-            tensor<int32, [3]> var_1388 = const()[name = tensor<string, []>("op_1388"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> out = reshape(shape = var_1388, x = out_33)[name = tensor<string, []>("out")];
-            tensor<fp32, [12, 5, 256]> var_1390 = silu(x = input_187)[name = tensor<string, []>("op_1390")];
-            tensor<fp32, [12, 5, 256]> input_189 = mul(x = var_1390, y = out)[name = tensor<string, []>("input_189")];
-            tensor<fp32, [12, 5, 256]> ret_out = linear(bias = decoder_out_proj_1_bias, weight = decoder_out_proj_1_weight, x = input_189)[name = tensor<string, []>("linear_51")];
-            tensor<fp32, [12, 5, 256]> input_191 = add(x = x, y = ret_out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 5, 4, 64]> var_1427 = transpose(perm = var_1427_perm_0, x = out_31)[name = tensor<string, []>("transpose_11")];
+            tensor<fp32, [12, 5, 4, 64]> out_33 = layer_norm(axes = out_33_axes_0, epsilon = var_84, x = var_1427)[name = tensor<string, []>("out_33")];
+            tensor<int32, [3]> var_1431 = const()[name = tensor<string, []>("op_1431"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> out = reshape(shape = var_1431, x = out_33)[name = tensor<string, []>("out")];
+            tensor<fp32, [12, 5, 256]> var_1433 = silu(x = input_189)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [12, 5, 256]> input_191 = mul(x = var_1433, y = out)[name = tensor<string, []>("input_191")];
+            tensor<fp32, [12, 5, 256]> ret_out = linear(bias = inner_decoder_out_proj_1_bias, weight = inner_decoder_out_proj_1_weight, x = input_191)[name = tensor<string, []>("linear_51")];
+            tensor<fp32, [12, 5, 256]> input_193 = add(x = x, y = ret_out)[name = tensor<string, []>("input_193")];
             tensor<int32, [1]> xt_5_axes_0 = const()[name = tensor<string, []>("xt_5_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [12, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = decoder_norm11_1_bias, epsilon = var_1050, gamma = decoder_norm11_1_weight, x = input_191)[name = tensor<string, []>("xt_5")];
-            tensor<int32, [4]> var_1400 = const()[name = tensor<string, []>("op_1400"), val = tensor<int32, [4]>([1, 12, 5, 256])];
-            tensor<fp32, [1, 12, 5, 256]> var_1401 = reshape(shape = var_1400, x = xt_5)[name = tensor<string, []>("op_1401")];
-            tensor<int32, [4]> var_1402_perm_0 = const()[name = tensor<string, []>("op_1402_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
-            tensor<int32, [3]> var_1405 = const()[name = tensor<string, []>("op_1405"), val = tensor<int32, [3]>([5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> var_1402 = transpose(perm = var_1402_perm_0, x = var_1401)[name = tensor<string, []>("transpose_10")];
-            tensor<fp32, [5, 12, 256]> query_5 = reshape(shape = var_1405, x = var_1402)[name = tensor<string, []>("query_5")];
+            tensor<fp32, [12, 5, 256]> xt_5 = layer_norm(axes = xt_5_axes_0, beta = inner_decoder_norm11_1_bias, epsilon = var_76, gamma = inner_decoder_norm11_1_weight, x = input_193)[name = tensor<string, []>("xt_5")];
+            tensor<int32, [4]> var_1443 = const()[name = tensor<string, []>("op_1443"), val = tensor<int32, [4]>([1, 12, 5, 256])];
+            tensor<fp32, [1, 12, 5, 256]> var_1444 = reshape(shape = var_1443, x = xt_5)[name = tensor<string, []>("op_1444")];
+            tensor<int32, [4]> var_1445_perm_0 = const()[name = tensor<string, []>("op_1445_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> var_1445 = transpose(perm = var_1445_perm_0, x = var_1444)[name = tensor<string, []>("transpose_10")];
+            tensor<fp32, [5, 12, 256]> query_5 = reshape(shape = var_1448, x = var_1445)[name = tensor<string, []>("query_5")];
             tensor<int32, [3]> query_perm_0 = const()[name = tensor<string, []>("query_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [12, 5, 256]> query = transpose(perm = query_perm_0, x = query_5)[name = tensor<string, []>("transpose_9")];
-            tensor<fp32, [12, 5, 768]> var_1428 = linear(bias = decoder_self_attn2_1_in_proj_bias, weight = decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
+            tensor<fp32, [12, 5, 768]> var_1471 = linear(bias = inner_decoder_self_attn2_1_in_proj_bias, weight = inner_decoder_self_attn2_1_in_proj_weight, x = query)[name = tensor<string, []>("linear_52")];
             tensor<int32, [4]> concat_2 = const()[name = tensor<string, []>("concat_2"), val = tensor<int32, [4]>([12, 5, 3, 256])];
-            tensor<fp32, [12, 5, 3, 256]> var_1430 = reshape(shape = concat_2, x = var_1428)[name = tensor<string, []>("op_1430")];
-            tensor<int32, [1]> var_1431_axes_0 = const()[name = tensor<string, []>("op_1431_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp32, [1, 12, 5, 3, 256]> var_1431 = expand_dims(axes = var_1431_axes_0, x = var_1430)[name = tensor<string, []>("op_1431")];
-            tensor<int32, [5]> var_1432_perm_0 = const()[name = tensor<string, []>("op_1432_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
-            tensor<int32, [1]> var_1433_axes_0 = const()[name = tensor<string, []>("op_1433_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp32, [3, 12, 5, 1, 256]> var_1432 = transpose(perm = var_1432_perm_0, x = var_1431)[name = tensor<string, []>("transpose_8")];
-            tensor<fp32, [3, 12, 5, 256]> var_1433 = squeeze(axes = var_1433_axes_0, x = var_1432)[name = tensor<string, []>("op_1433")];
+            tensor<fp32, [12, 5, 3, 256]> var_1473 = reshape(shape = concat_2, x = var_1471)[name = tensor<string, []>("op_1473")];
+            tensor<int32, [1]> var_1474_axes_0 = const()[name = tensor<string, []>("op_1474_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp32, [1, 12, 5, 3, 256]> var_1474 = expand_dims(axes = var_1474_axes_0, x = var_1473)[name = tensor<string, []>("op_1474")];
+            tensor<int32, [5]> var_1475_perm_0 = const()[name = tensor<string, []>("op_1475_perm_0"), val = tensor<int32, [5]>([-2, 1, 2, 0, 4])];
+            tensor<int32, [1]> var_1476_axes_0 = const()[name = tensor<string, []>("op_1476_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp32, [3, 12, 5, 1, 256]> var_1475 = transpose(perm = var_1475_perm_0, x = var_1474)[name = tensor<string, []>("transpose_8")];
+            tensor<fp32, [3, 12, 5, 256]> var_1476 = squeeze(axes = var_1476_axes_0, x = var_1475)[name = tensor<string, []>("op_1476")];
             tensor<int32, [4]> q_19_begin_0 = const()[name = tensor<string, []>("q_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [4]> q_19_end_0 = const()[name = tensor<string, []>("q_19_end_0"), val = tensor<int32, [4]>([1, 12, 5, 256])];
             tensor<bool, [4]> q_19_end_mask_0 = const()[name = tensor<string, []>("q_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> q_19_squeeze_mask_0 = const()[name = tensor<string, []>("q_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("q_19")];
+            tensor<fp32, [12, 5, 256]> q_19 = slice_by_index(begin = q_19_begin_0, end = q_19_end_0, end_mask = q_19_end_mask_0, squeeze_mask = q_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("q_19")];
             tensor<int32, [4]> k_19_begin_0 = const()[name = tensor<string, []>("k_19_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
             tensor<int32, [4]> k_19_end_0 = const()[name = tensor<string, []>("k_19_end_0"), val = tensor<int32, [4]>([2, 12, 5, 256])];
             tensor<bool, [4]> k_19_end_mask_0 = const()[name = tensor<string, []>("k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> k_19_squeeze_mask_0 = const()[name = tensor<string, []>("k_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("k_19")];
+            tensor<fp32, [12, 5, 256]> k_19 = slice_by_index(begin = k_19_begin_0, end = k_19_end_0, end_mask = k_19_end_mask_0, squeeze_mask = k_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("k_19")];
             tensor<int32, [4]> v_19_begin_0 = const()[name = tensor<string, []>("v_19_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
             tensor<int32, [4]> v_19_end_0 = const()[name = tensor<string, []>("v_19_end_0"), val = tensor<int32, [4]>([3, 12, 5, 256])];
             tensor<bool, [4]> v_19_end_mask_0 = const()[name = tensor<string, []>("v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
             tensor<bool, [4]> v_19_squeeze_mask_0 = const()[name = tensor<string, []>("v_19_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
-            tensor<fp32, [12, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1433)[name = tensor<string, []>("v_19")];
-            tensor<int32, [3]> var_1441 = const()[name = tensor<string, []>("op_1441"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1442 = reshape(shape = var_1441, x = q_19)[name = tensor<string, []>("op_1442")];
+            tensor<fp32, [12, 5, 256]> v_19 = slice_by_index(begin = v_19_begin_0, end = v_19_end_0, end_mask = v_19_end_mask_0, squeeze_mask = v_19_squeeze_mask_0, x = var_1476)[name = tensor<string, []>("v_19")];
+            tensor<int32, [3]> var_1484 = const()[name = tensor<string, []>("op_1484"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1485 = reshape(shape = var_1484, x = q_19)[name = tensor<string, []>("op_1485")];
             tensor<int32, [3]> q_21_perm_0 = const()[name = tensor<string, []>("q_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1448 = const()[name = tensor<string, []>("op_1448"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1449 = reshape(shape = var_1448, x = k_19)[name = tensor<string, []>("op_1449")];
+            tensor<int32, [3]> var_1491 = const()[name = tensor<string, []>("op_1491"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1492 = reshape(shape = var_1491, x = k_19)[name = tensor<string, []>("op_1492")];
             tensor<int32, [3]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [3]> var_1455 = const()[name = tensor<string, []>("op_1455"), val = tensor<int32, [3]>([12, 20, 64])];
-            tensor<fp32, [12, 20, 64]> var_1456 = reshape(shape = var_1455, x = v_19)[name = tensor<string, []>("op_1456")];
+            tensor<int32, [3]> var_1498 = const()[name = tensor<string, []>("op_1498"), val = tensor<int32, [3]>([12, 20, 64])];
+            tensor<fp32, [12, 20, 64]> var_1499 = reshape(shape = var_1498, x = v_19)[name = tensor<string, []>("op_1499")];
             tensor<int32, [3]> v_21_perm_0 = const()[name = tensor<string, []>("v_21_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-            tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1442)[name = tensor<string, []>("transpose_7")];
-            tensor<fp32, [5, 4, 12, 64]> q = reshape(shape = var_1459, x = q_21)[name = tensor<string, []>("q")];
-            tensor<int32, [4]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1449)[name = tensor<string, []>("transpose_6")];
-            tensor<fp32, [5, 4, 12, 64]> k = reshape(shape = var_1461, x = k_21)[name = tensor<string, []>("k")];
-            tensor<int32, [4]> var_1463 = const()[name = tensor<string, []>("op_1463"), val = tensor<int32, [4]>([5, 4, 12, 64])];
-            tensor<fp32, [20, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1456)[name = tensor<string, []>("transpose_5")];
-            tensor<fp32, [5, 4, 12, 64]> v = reshape(shape = var_1463, x = v_21)[name = tensor<string, []>("v")];
+            tensor<int32, [4]> var_1502 = const()[name = tensor<string, []>("op_1502"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> q_21 = transpose(perm = q_21_perm_0, x = var_1485)[name = tensor<string, []>("transpose_7")];
+            tensor<fp32, [5, 4, 12, 64]> q = reshape(shape = var_1502, x = q_21)[name = tensor<string, []>("q")];
+            tensor<int32, [4]> var_1504 = const()[name = tensor<string, []>("op_1504"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> k_21 = transpose(perm = k_21_perm_0, x = var_1492)[name = tensor<string, []>("transpose_6")];
+            tensor<fp32, [5, 4, 12, 64]> k = reshape(shape = var_1504, x = k_21)[name = tensor<string, []>("k")];
+            tensor<int32, [4]> var_1506 = const()[name = tensor<string, []>("op_1506"), val = tensor<int32, [4]>([5, 4, 12, 64])];
+            tensor<fp32, [20, 12, 64]> v_21 = transpose(perm = v_21_perm_0, x = var_1499)[name = tensor<string, []>("transpose_5")];
+            tensor<fp32, [5, 4, 12, 64]> v = reshape(shape = var_1506, x = v_21)[name = tensor<string, []>("v")];
             tensor<fp32, []> mul_3_y_0 = const()[name = tensor<string, []>("mul_3_y_0"), val = tensor<fp32, []>(0x1p-3)];
             tensor<fp32, [5, 4, 12, 64]> mul_3 = mul(x = q, y = mul_3_y_0)[name = tensor<string, []>("mul_3")];
             tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(true)];
@@ -1317,36 +1336,36 @@ program(1.0)
             tensor<bool, []> attn_output_9_transpose_x_0 = const()[name = tensor<string, []>("attn_output_9_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> attn_output_9_transpose_y_0 = const()[name = tensor<string, []>("attn_output_9_transpose_y_0"), val = tensor<bool, []>(false)];
             tensor<fp32, [5, 4, 12, 64]> attn_output_9 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_1, y = v)[name = tensor<string, []>("attn_output_9")];
-            tensor<int32, [4]> var_1466 = const()[name = tensor<string, []>("op_1466"), val = tensor<int32, [4]>([2, 0, 1, 3])];
-            tensor<int32, [2]> var_1471 = const()[name = tensor<string, []>("op_1471"), val = tensor<int32, [2]>([60, 256])];
-            tensor<fp32, [12, 5, 4, 64]> var_1467 = transpose(perm = var_1466, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
-            tensor<fp32, [60, 256]> attn_output_11 = reshape(shape = var_1471, x = var_1467)[name = tensor<string, []>("attn_output_11")];
-            tensor<fp32, [60, 256]> attn_output_13 = linear(bias = decoder_self_attn2_1_out_proj_bias, weight = decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
-            tensor<int32, [3]> var_1475 = const()[name = tensor<string, []>("op_1475"), val = tensor<int32, [3]>([12, 5, 256])];
-            tensor<fp32, [12, 5, 256]> attn_output = reshape(shape = var_1475, x = attn_output_13)[name = tensor<string, []>("attn_output")];
+            tensor<int32, [4]> var_1509 = const()[name = tensor<string, []>("op_1509"), val = tensor<int32, [4]>([2, 0, 1, 3])];
+            tensor<int32, [2]> var_1514 = const()[name = tensor<string, []>("op_1514"), val = tensor<int32, [2]>([60, 256])];
+            tensor<fp32, [12, 5, 4, 64]> var_1510 = transpose(perm = var_1509, x = attn_output_9)[name = tensor<string, []>("transpose_4")];
+            tensor<fp32, [60, 256]> attn_output_11 = reshape(shape = var_1514, x = var_1510)[name = tensor<string, []>("attn_output_11")];
+            tensor<fp32, [60, 256]> attn_output_13 = linear(bias = inner_decoder_self_attn2_1_out_proj_bias, weight = inner_decoder_self_attn2_1_out_proj_weight, x = attn_output_11)[name = tensor<string, []>("linear_53")];
+            tensor<int32, [3]> var_1518 = const()[name = tensor<string, []>("op_1518"), val = tensor<int32, [3]>([12, 5, 256])];
+            tensor<fp32, [12, 5, 256]> attn_output = reshape(shape = var_1518, x = attn_output_13)[name = tensor<string, []>("attn_output")];
             tensor<int32, [3]> sa2_out_perm_0 = const()[name = tensor<string, []>("sa2_out_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
             tensor<fp32, [5, 12, 256]> sa2_out = transpose(perm = sa2_out_perm_0, x = attn_output)[name = tensor<string, []>("transpose_3")];
-            tensor<fp32, [5, 12, 256]> input_193 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_193")];
-            tensor<int32, [1]> input_195_axes_0 = const()[name = tensor<string, []>("input_195_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> input_195 = layer_norm(axes = input_195_axes_0, beta = decoder_norm21_1_bias, epsilon = var_1050, gamma = decoder_norm21_1_weight, x = input_193)[name = tensor<string, []>("input_195")];
-            tensor<fp32, [5, 12, 2048]> input_197 = linear(bias = decoder_linear1_1_bias, weight = decoder_linear1_1_weight, x = input_195)[name = tensor<string, []>("linear_54")];
-            tensor<fp32, [5, 12, 2048]> input_199 = relu(x = input_197)[name = tensor<string, []>("input_199")];
-            tensor<fp32, [5, 12, 256]> ffn_out = linear(bias = decoder_linear2_1_bias, weight = decoder_linear2_1_weight, x = input_199)[name = tensor<string, []>("linear_55")];
-            tensor<fp32, [5, 12, 256]> input_201 = add(x = input_195, y = ffn_out)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 12, 256]> input_195 = add(x = query_5, y = sa2_out)[name = tensor<string, []>("input_195")];
+            tensor<int32, [1]> input_197_axes_0 = const()[name = tensor<string, []>("input_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [5, 12, 256]> input_197 = layer_norm(axes = input_197_axes_0, beta = inner_decoder_norm21_1_bias, epsilon = var_76, gamma = inner_decoder_norm21_1_weight, x = input_195)[name = tensor<string, []>("input_197")];
+            tensor<fp32, [5, 12, 2048]> input_199 = linear(bias = inner_decoder_linear1_1_bias, weight = inner_decoder_linear1_1_weight, x = input_197)[name = tensor<string, []>("linear_54")];
+            tensor<fp32, [5, 12, 2048]> input_201 = relu(x = input_199)[name = tensor<string, []>("input_201")];
+            tensor<fp32, [5, 12, 256]> ffn_out = linear(bias = inner_decoder_linear2_1_bias, weight = inner_decoder_linear2_1_weight, x = input_201)[name = tensor<string, []>("linear_55")];
+            tensor<fp32, [5, 12, 256]> input_203 = add(x = input_197, y = ffn_out)[name = tensor<string, []>("input_203")];
             tensor<int32, [1]> xt_axes_0 = const()[name = tensor<string, []>("xt_axes_0"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [5, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = decoder_norm22_1_bias, epsilon = var_1050, gamma = decoder_norm22_1_weight, x = input_201)[name = tensor<string, []>("xt")];
-            tensor<int32, [4]> var_1495 = const()[name = tensor<string, []>("op_1495"), val = tensor<int32, [4]>([1, 5, 12, 256])];
-            tensor<fp32, [1, 5, 12, 256]> input = reshape(shape = var_1495, x = xt)[name = tensor<string, []>("input")];
-            tensor<int32, [1]> var_1497 = const()[name = tensor<string, []>("op_1497"), val = tensor<int32, [1]>([-1])];
-            tensor<fp32, [1, 5, 12, 1]> var_1498 = reduce_l2_norm(axes = var_1497, keep_dims = var_1053, x = input)[name = tensor<string, []>("op_1498")];
+            tensor<fp32, [5, 12, 256]> xt = layer_norm(axes = xt_axes_0, beta = inner_decoder_norm22_1_bias, epsilon = var_76, gamma = inner_decoder_norm22_1_weight, x = input_203)[name = tensor<string, []>("xt")];
+            tensor<int32, [4]> var_1538 = const()[name = tensor<string, []>("op_1538"), val = tensor<int32, [4]>([1, 5, 12, 256])];
+            tensor<fp32, [1, 5, 12, 256]> input = reshape(shape = var_1538, x = xt)[name = tensor<string, []>("input")];
+            tensor<int32, [1]> var_1540 = const()[name = tensor<string, []>("op_1540"), val = tensor<int32, [1]>([-1])];
+            tensor<fp32, [1, 5, 12, 1]> var_1541 = reduce_l2_norm(axes = var_1540, keep_dims = var_75, x = input)[name = tensor<string, []>("op_1541")];
             tensor<fp32, []> const_42 = const()[name = tensor<string, []>("const_42"), val = tensor<fp32, []>(0x1.fffffep+127)];
-            tensor<fp32, [1, 5, 12, 1]> clip_5 = clip(alpha = var_1045, beta = const_42, x = var_1498)[name = tensor<string, []>("clip_5")];
-            tensor<fp32, [1, 5, 12, 256]> var_1500 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1500")];
+            tensor<fp32, [1, 5, 12, 1]> clip_5 = clip(alpha = var_90, beta = const_42, x = var_1541)[name = tensor<string, []>("clip_5")];
+            tensor<fp32, [1, 5, 12, 256]> var_1543 = real_div(x = input, y = clip_5)[name = tensor<string, []>("op_1543")];
             tensor<int32, [3]> concat_6 = const()[name = tensor<string, []>("concat_6"), val = tensor<int32, [3]>([5, 1, 256])];
             tensor<fp32, [5, 1, 256]> reshape_0 = reshape(shape = concat_6, x = emb)[name = tensor<string, []>("reshape_0")];
             tensor<int32, [4]> transpose_1_perm_0 = const()[name = tensor<string, []>("transpose_1_perm_0"), val = tensor<int32, [4]>([0, 1, 3, 2])];
             tensor<int32, [3]> concat_7 = const()[name = tensor<string, []>("concat_7"), val = tensor<int32, [3]>([5, 256, 12])];
-            tensor<fp32, [1, 5, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1500)[name = tensor<string, []>("transpose_2")];
+            tensor<fp32, [1, 5, 256, 12]> transpose_1 = transpose(perm = transpose_1_perm_0, x = var_1543)[name = tensor<string, []>("transpose_2")];
             tensor<fp32, [5, 256, 12]> reshape_1 = reshape(shape = concat_7, x = transpose_1)[name = tensor<string, []>("reshape_1")];
             tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
             tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
@@ -1357,10 +1376,10 @@ program(1.0)
             tensor<int32, [3]> output_end_0 = const()[name = tensor<string, []>("output_end_0"), val = tensor<int32, [3]>([1, 5, 11])];
             tensor<bool, [3]> output_end_mask_0 = const()[name = tensor<string, []>("output_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
             tensor<fp32, [1, 5, 10]> output = slice_by_index(begin = output_begin_0, end = output_end_0, end_mask = output_end_mask_0, x = reshape_2)[name = tensor<string, []>("output")];
-            tensor<fp32, [1, 5, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1504")];
-            tensor<int32, []> var_1506_axis_0 = const()[name = tensor<string, []>("op_1506_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1506_axis_0, values = (var_1202, nkv))[name = tensor<string, []>("op_1506")];
-            tensor<int32, []> var_1508_axis_0 = const()[name = tensor<string, []>("op_1508_axis_0"), val = tensor<int32, []>(0)];
-            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1508_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1508")];
+            tensor<fp32, [1, 5, 10]> probs = sigmoid(x = output)[name = tensor<string, []>("op_1547")];
+            tensor<int32, []> var_1549_axis_0 = const()[name = tensor<string, []>("op_1549_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 12, 4, 64, 64]> dec_kv_new = stack(axis = var_1549_axis_0, values = (var_1245, nkv))[name = tensor<string, []>("op_1549")];
+            tensor<int32, []> var_1551_axis_0 = const()[name = tensor<string, []>("op_1551_axis_0"), val = tensor<int32, []>(0)];
+            tensor<fp32, [2, 1]> dec_scale_new = stack(axis = var_1551_axis_0, values = (new_scale_9, new_scale))[name = tensor<string, []>("op_1551")];
         } -> (probs, enc_kv_new, enc_scale_new, enc_conv_cache_new, cnn_window_new, dec_kv_new, dec_scale_new);
 }
\ No newline at end of file
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
index 55956bf9724e5d26a5683d705abec5e007070969..176f6bc60a62201091f417bee7b486615bb89b1c 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Data/com.apple.CoreML/model.mlmodel
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:caa9e47ded51d738809bca4aef08aede86aea03f444055505cb42b1a9539bb14
-size 196629
+oid sha256:c4b74a9b812800e63bff390dd09aee47559545f428d8bb10703a36598cefc708
+size 203229
diff --git a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Manifest.json b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Manifest.json
index eb273aebdeb752ae1ebf3dffb84bfed3c48e15a9..f0929c8403f40931887ab605b60dabc479f935aa 100644
--- a/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Manifest.json
+++ b/optimized/dih3/500ms/ls_eend_dih3_500ms.mlpackage/Manifest.json
@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "36D483E8-31EB-4224-A050-837C09E85C85": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "F67F7E02-4DC3-4166-A52A-2D9FE21F41D8": {
+        "70FFA018-D34B-4B71-9B7D-3D3754E252B7": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "CE8A955D-9842-41C5-A7BC-A91B484DBFCD": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
-    "rootModelIdentifier": "36D483E8-31EB-4224-A050-837C09E85C85"
+    "rootModelIdentifier": "CE8A955D-9842-41C5-A7BC-A91B484DBFCD"
 }