benchang1110
/

demo_utils

Model card Files Files and versions

xet

Community

benchang1110 committed on Sep 6

Commit

7ca2faf

verified ·

1 Parent(s): 3432a0c

Delete instruction/gen_instruction.py

Browse files

Files changed (1) hide show

instruction/gen_instruction.py +0 -612

instruction/gen_instruction.py DELETED Viewed

@@ -1,612 +0,0 @@
-import argparse
-# Operation groups: one-hot codes for the 14-bit `op` field written first by Instruction.to_binary.
-NOP         = 0b00000000000001
-TEST_BW     = 0b00000000000010
-LOAD        = 0b00000000000100
-STORE       = 0b00000000001000
-MLP_WM      = 0b00000000010000
-MLP_QKT     = 0b00000000100000
-QKT_M_RSQRT = 0b00000001000000
-MLP_HP      = 0b00000010000000
-GATE        = 0b00000100000000
-RRMS        = 0b00001000000000
-RMSNORM     = 0b00010000000000
-SOFTMAX     = 0b00100000000000
-RESIDUAL    = 0b01000000000000
-ROPE        = 0b10000000000000
-# Compute-mode (cmode) groups: values for the 1-bit `cmode` field.
-MX_INT8     = 0b0
-MX_INT4     = 0b1
-# Stage groups: values for the 1-bit `stage` field (presumably summarization vs. generation; TODO confirm).
-SUM         = 0b0
-GEN         = 0b1
-# Nonlinear groups: values for the 1-bit `nonlinear` field (NO_ACT or SILU).
-NO_ACT      = 0b0
-SILU        = 0b1
-# Load-target groups: values for the 3-bit `load_target` field.
-NULL        = 0b000
-ACT         = 0b001
-ACT_S       = 0b010
-RESI        = 0b011
-RESI_S      = 0b100
-WEIGHT_S    = 0b101
-# Write-back groups: one-hot codes for the 8-bit `write_back` field.
-NO_WB       = 0b00000001
-WB          = 0b00000010
-WB_KV       = 0b00000100
-WB_KV_S     = 0b00001000
-WB_ACT      = 0b00010000
-WB_ACT_S    = 0b00100000
-WB_RESI     = 0b01000000
-WB_RESI_S   = 0b10000000
-# Address space: each row binds a base address plus its per-layer and/or per-token offset; the generation script forms addresses as base + offset * index (the MXINT4 weight path uses half the weight layer offset).
-WQ_BASE_ADDR    , WQ_LAYER_OFFSET                       = 0x00000000, 0x00100000
-WK_BASE_ADDR    , WK_LAYER_OFFSET                       = 0x02000000, 0x00100000
-WV_BASE_ADDR    , WV_LAYER_OFFSET                       = 0x04000000, 0x00100000
-WO_BASE_ADDR    , WO_LAYER_OFFSET                       = 0x06000000, 0x00100000
-W1_BASE_ADDR    , W1_LAYER_OFFSET                       = 0x08000000, 0x002B0000
-W3_BASE_ADDR    , W3_LAYER_OFFSET                       = 0x0D600000, 0x002B0000
-W2_BASE_ADDR    , W2_LAYER_OFFSET                       = 0x12C00000, 0x002B0000
-WQS_BASE_ADDR   , WQS_LAYER_OFFSET                      = 0x18200000, 0x00008000
-WKS_BASE_ADDR   , WKS_LAYER_OFFSET                      = 0x18300000, 0x00008000
-WVS_BASE_ADDR   , WVS_LAYER_OFFSET                      = 0x18400000, 0x00008000
-WOS_BASE_ADDR   , WOS_LAYER_OFFSET                      = 0x18500000, 0x00008000
-W1S_BASE_ADDR   , W1S_LAYER_OFFSET                      = 0x18600000, 0x0015800
-W3S_BASE_ADDR   , W3S_LAYER_OFFSET                      = 0x188B0000, 0x0015800
-W2S_BASE_ADDR   , W2S_LAYER_OFFSET                      = 0x18B60000, 0x0018000
-KC_BASE_ADDR    , KC_LAYER_OFFSET   , KC_TOKEN_OFFSET   = 0x18E60000, 0x0080000     , 0x100
-VC_BASE_ADDR    , VC_LAYER_OFFSET   , VC_TOKEN_OFFSET   = 0x19E60000, 0x0080000     , 0x100
-KCS_BASE_ADDR   , KCS_LAYER_OFFSET  , KCS_TOKEN_OFFSET  = 0x1AE60000, 0x0040000     , 0x80
-VCS_BASE_ADDR   , VCS_LAYER_OFFSET  , VCS_TOKEN_OFFSET  = 0x1B660000, 0x0040000     , 0x80
-ACT_BASE_ADDR   , ACT_TOKEN_OFFSET                      = 0x1BE60000, 0x1000
-RESI_BASE_ADDR  , RESI_TOKEN_OFFSET                     = 0x1C660000, 0x1000
-ACTS_BASE_ADDR  , ACTS_TOKEN_OFFSET                     = 0x1CE60000, 0x80
-RESIS_BASE_ADDR , RESIS_TOKEN_OFFSET                    = 0x1CEA0000, 0x80
-PRENORM_ADDR    , PRENORM_LAYER_OFFSET                  = 0x1CEE0000, 0x1000
-POSTNORM_ADDR   , POSTNORM_LAYER_OFFSET                 = 0x1CF00000, 0x1000
-ROPE_BASE_ADDR  , ROPE_TOKEN_OFFSET                     = 0x1CF20000, 0xC0
-PRENORMS_ADDR   , PRENORMS_LAYER_OFFSET                 = 0x1CFE0000, 0x80
-POSTNORMS_ADDR  , POSTNORMS_LAYER_OFFSET                = 0x1CFE1000, 0x80
-OUTNORM_ADDR                                            = 0x1D840000
-OUTNORMS_ADDR                                           = 0x1D841000
-WHEAD_BASE_ADDR     ,   WHEAD_LAYER_OFFSET              = 0x1D000000, 0x100000
-WHEADS_BASE_ADDR    ,   WHEADS_LAYER_OFFSET             = 0x1D800000, 0x8000
-HEAD_OUT_BASE_ADDR  ,   HEAD_OUT_LAYER_OFFSET           = 0x1E000000, 0x1000
-HEADS_OUT_BASE_ADDR ,   HEADS_OUT_LAYER_OFFSET          = 0x1E008000, 0x80
-class Instruction:  # One instruction record: 17 integer fields that to_binary() serialises into a 512-character bit string.
-    def __init__(self,  # Stores each argument on the instance unchanged; nothing is validated here.
-                 op,  # 14-bit field in to_binary()
-                 dq_en,  # 1-bit field
-                 stage,  # 1-bit field
-                 token,  # 11-bit field
-                 load_target,  # 3-bit field
-                 cmode,  # 1-bit field
-                 nonlinear,  # 1-bit field
-                 write_back,  # 8-bit field
-                 input_dim,  # 16-bit field
-                 output_dim,  # 16-bit field
-                 input_addr,  # 32-bit field
-                 scale_addr,  # 32-bit field
-                 output_addr,  # 32-bit field
-                 layer_offset,  # 32-bit field
-                 token_offset,  # 32-bit field
-                 num_cb_ws,  # 16-bit field
-                 num_cb_wm  # 16-bit field
-                ):
-        self.op = op
-        self.dq_en = dq_en
-        self.stage = stage
-        self.token = token
-        self.load_target = load_target
-        self.cmode = cmode
-        self.nonlinear = nonlinear
-        self.write_back = write_back
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.input_addr = input_addr
-        self.scale_addr = scale_addr
-        self.output_addr = output_addr
-        self.layer_offset = layer_offset
-        self.token_offset = token_offset
-        self.num_cb_ws = num_cb_ws
-        self.num_cb_wm = num_cb_wm
-    def to_binary(self, inst_num:int, inst_info:str):  # Prints an INFO line (inst_info, inst_num) and returns the instruction as a 512-character string of '0'/'1'.
-        print('INFO: {:30s} , Instruction id: {}'.format(inst_info, inst_num))
-        # Concatenate the fields, op first, as zero-padded binary text (264 bits in total). NOTE(review): the widths are minimums, so an oversized value lengthens the string and shifts every field to its left instead of being rejected.
-        binary_format = (
-            f"{self.op:014b}{self.dq_en:01b}{self.stage:01b}{self.token:011b}"
-            f"{self.load_target:03b}{self.cmode:01b}{self.nonlinear:01b}{self.write_back:08b}"
-            f"{self.input_dim:016b}{self.output_dim:016b}"
-            f"{self.input_addr:032b}{self.scale_addr:032b}{self.output_addr:032b}"
-            f"{self.layer_offset:032b}{self.token_offset:032b}{self.num_cb_ws:016b}{self.num_cb_wm:016b}"
-        )
-        padding_length = 512 - len(binary_format)  # 512 - 264 = 248 when every field fits its width
-        binary_format = '0' * padding_length + binary_format  # zero-fill on the left; a negative padding_length adds nothing, leaving the result longer than 512
-        return binary_format
-def gen_inst(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm):  # Positional pass-through to the Instruction constructor, same argument order.
-    return Instruction(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm)
-# =====================================================================================================================================================================================================================
-# Test bandwidth: TEST_BW template with stage SUM and every other field at its zero/NULL/NO_WB value
-# =====================================================================================================================================================================================================================
-inst_test_bw        = gen_inst(TEST_BW,     0, SUM, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )  # NOTE(review): inst_test_bw is assigned again (stage GEN) in the MXINT8 templates, so this SUM-stage object is no longer reachable by name afterwards
-# =====================================================================================================================================================================================================================
-# MXINT8 Instruction templates: module-level Instruction objects; the generation script overwrites their address/token/dim attributes in place before each to_binary() call
-# =====================================================================================================================================================================================================================
-inst_nop            = gen_inst(NOP,         0, SUM, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
-inst_test_bw        = gen_inst(TEST_BW,     0, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )  # NOTE(review): rebinds inst_test_bw, replacing the SUM-stage template defined in the test-bandwidth section
-inst_load_resi      = gen_inst(LOAD,        0, GEN, 0, RESI,        MX_INT8, NO_ACT,    NO_WB,      4096,   0       , RESI_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESI_TOKEN_OFFSET, 0, 0      )
-inst_load_in_act    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , ACT_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACT_TOKEN_OFFSET, 0, 0      )
-inst_load_resi_s    = gen_inst(LOAD,        0, GEN, 0, RESI_S,      MX_INT8, NO_ACT,    NO_WB,      128,    0       , RESIS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0      )
-inst_load_in_act_s  = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , ACTS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0      )
-inst_mlp_wm_q       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WQ_BASE_ADDR, WQS_BASE_ADDR, 0x00000000, WQ_LAYER_OFFSET, 0x00000000, 128, 4096    )
-inst_mlp_wm_k       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WK_BASE_ADDR, WKS_BASE_ADDR, 0x00000000, WK_LAYER_OFFSET, 0x00000000, 128, 4096    )
-inst_mlp_wm_v       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    WB,         4096,   4096    , WV_BASE_ADDR, WVS_BASE_ADDR, 0x00000000, WV_LAYER_OFFSET, 0x00000000, 128, 4096    )
-inst_mlp_wm_o       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WO_BASE_ADDR, WOS_BASE_ADDR, 0x00000000, WO_LAYER_OFFSET, 0x00000000, 128, 4096    )
-inst_mlp_wm_w1      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, SILU,      NO_WB,      4096,   11008   , W1_BASE_ADDR, W1S_BASE_ADDR, 0x00000000, W1_LAYER_OFFSET, 0x00000000, 32, 1024    )
-inst_mlp_wm_w3      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   11008   , W3_BASE_ADDR, W3S_BASE_ADDR, 0x00000000, W3_LAYER_OFFSET, 0x00000000, 32, 1024    )
-inst_mlp_wm_w2      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      11008,  4096    , W2_BASE_ADDR, W2S_BASE_ADDR, 0x00000000, W2_LAYER_OFFSET, 0x00000000, 24, 688    )
-inst_gate           = gen_inst(GATE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      11008,  11008   , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
-inst_residual       = gen_inst(RESIDUAL,    1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB,         4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
-inst_store_act      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, ACT_BASE_ADDR, 0x00000000, ACT_TOKEN_OFFSET, 0, 0      )
-inst_store_act_s    = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, ACTS_BASE_ADDR, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0      )
-inst_store_resi     = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, RESI_BASE_ADDR, 0x00000000, RESI_TOKEN_OFFSET, 0, 0      )  # NOTE(review): write_back is WB_ACT although WB_RESI is defined; confirm this is intended
-inst_store_resi_s   = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, RESIS_BASE_ADDR, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0      )  # NOTE(review): write_back is WB_ACT_S although WB_RESI_S is defined; confirm this is intended
-inst_rope_nwb       = gen_inst(ROPE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      192,    4096    , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0      )
-inst_rope_wb        = gen_inst(ROPE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB,         192,    4096    , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0      )
-inst_store_k        = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV,      4096,   4096    , 0x00000000, 0x00000000, KC_BASE_ADDR, 0x00000000, KC_TOKEN_OFFSET, 0, 0      )
-inst_store_k_s      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV_S,    4096,   4096    , 0x00000000, 0x00000000, KCS_BASE_ADDR, 0x00000000, KCS_TOKEN_OFFSET, 0, 0      )
-inst_store_v        = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV,      4096,   4096    , 0x00000000, 0x00000000, VC_BASE_ADDR, 0x00000000, VC_TOKEN_OFFSET, 0, 0      )
-inst_store_v_s      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV_S,    4096,   4096    , 0x00000000, 0x00000000, VCS_BASE_ADDR, 0x00000000, VCS_TOKEN_OFFSET, 0, 0      )
-inst_mlp_qkt        = gen_inst(MLP_QKT,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4,      1       , KC_BASE_ADDR, KCS_BASE_ADDR, 0x00000000, 0x00000000, KCS_TOKEN_OFFSET, 4, 32     )
-inst_qkt_m_rsqrt    = gen_inst(QKT_M_RSQRT, 1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      1,      1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 512    )
-inst_softmax        = gen_inst(SOFTMAX,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 0      )
-inst_mlp_hp         = gen_inst(MLP_HP,      1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4,      4096    , VC_BASE_ADDR, VCS_BASE_ADDR, 0x00000000, 0x00000000, VCS_TOKEN_OFFSET, 4, 32     )
-inst_prerrms        = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-inst_load_pre_wm    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , PRENORM_ADDR, 0x00000000, 0x00000000, PRENORM_LAYER_OFFSET, 0x00000000, 0, 0      )
-inst_load_pre_ws    = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , PRENORMS_ADDR, 0x00000000, 0x00000000, PRENORMS_LAYER_OFFSET, 0x00000000, 0, 0      )
-inst_prermsnorm     = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-inst_postrrms       = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-inst_load_post_wm   = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , POSTNORM_ADDR, 0x00000000, 0x00000000, POSTNORM_LAYER_OFFSET, 0x00000000, 0, 0      )
-inst_load_post_ws   = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , POSTNORMS_ADDR, 0x00000000, 0x00000000, POSTNORMS_LAYER_OFFSET, 0x00000000, 0, 0      )
-inst_postrmsnorm    = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-# =====================================================================================================================================================================================================================
-# DECODER OUT Instruction templates: output norm (OUTNORM/OUTNORMS addresses), head projection (WHEAD/WHEADS) and stores to the HEAD_OUT/HEADS_OUT regions
-# =====================================================================================================================================================================================================================
-inst_outrrms        = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-inst_load_out_wm    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , OUTNORM_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
-inst_load_out_ws    = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , OUTNORMS_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
-inst_outrmsnorm     = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
-inst_mlp_wm_whead   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    WB,         4096,   4096    , WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 4096    )
-inst_store_head     = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, HEAD_OUT_BASE_ADDR, 0x00000000, HEAD_OUT_LAYER_OFFSET, 0, 0      )
-inst_store_head_s   = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, HEADS_OUT_BASE_ADDR, 0x00000000, HEADS_OUT_LAYER_OFFSET, 0, 0      )
-# =====================================================================================================================================================================================================================
-# MXINT4 Instruction templates: same addresses and dims as the MXINT8 MLP_WM templates, with cmode MX_INT4 and num_cb_wm halved (4096 -> 2048, 1024 -> 512, 688 -> 344)
-# =====================================================================================================================================================================================================================
-inst_mlp_wm_whead_mxint4= gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    WB,         4096,   4096    , WHEAD_BASE_ADDR,  WHEADS_BASE_ADDR,   0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 2048    )
-inst_mlp_wm_q_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WQ_BASE_ADDR,     WQS_BASE_ADDR,      0x00000000, WQ_LAYER_OFFSET,    0x00000000, 128, 2048    )
-inst_mlp_wm_k_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WK_BASE_ADDR,     WKS_BASE_ADDR,      0x00000000, WK_LAYER_OFFSET,    0x00000000, 128, 2048    )
-inst_mlp_wm_v_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    WB,         4096,   4096    , WV_BASE_ADDR,     WVS_BASE_ADDR,      0x00000000, WV_LAYER_OFFSET,    0x00000000, 128, 2048    )
-inst_mlp_wm_o_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WO_BASE_ADDR,     WOS_BASE_ADDR,      0x00000000, WO_LAYER_OFFSET,    0x00000000, 128, 2048    )
-inst_mlp_wm_w1_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, SILU,      NO_WB,      4096,   11008   , W1_BASE_ADDR,     W1S_BASE_ADDR,      0x00000000, W1_LAYER_OFFSET,    0x00000000, 32, 512    )
-inst_mlp_wm_w3_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   11008   , W3_BASE_ADDR,     W3S_BASE_ADDR,      0x00000000, W3_LAYER_OFFSET,    0x00000000, 32, 512    )
-inst_mlp_wm_w2_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      11008,  4096    , W2_BASE_ADDR,     W2S_BASE_ADDR,      0x00000000, W2_LAYER_OFFSET,    0x00000000, 24, 344     )
-# =====================================================================================================================================================================================================================
-# Instruction templates end
-# =====================================================================================================================================================================================================================
-# Generation parameters. NOTE(review): the command line is parsed at module level, before the __main__ guard, so parse_args() also runs when this module is imported.
-parser = argparse.ArgumentParser(description='Generate instruction binary file for simulation')
-parser.add_argument('--sim_cmode', type=str, default='mxint8', choices=['mxint8', 'mxint4'], help='Simulation compute mode')
-parser.add_argument('--sim_llm_head', action='store_true', help='if true, generate LLM head instructions for FPGA simulation')
-args = parser.parse_args()
-SIM_CMODE       = args.sim_cmode  # 'mxint8' or 'mxint4'; selects which MLP_WM template set the script emits
-TEST_OP_GROUP   = 'demo'  # hard-coded; the script body runs its generation loop only when this equals 'demo'
-SIM_LLM_HEAD    = args.sim_llm_head
-SIM_LOGIT_FLAG  = '' if SIM_LLM_HEAD else '_no'  # inserted into the output file name: '' with --sim_llm_head, '_no' without
-SIM_NUM_TOKEN   = 1024  # number of tokens the demo loop iterates over
-SIM_NUM_LAYER   = 32  # layers iterated per token; NOTE(review): the output file name hard-codes "32L" instead of using this value
-if __name__ == "__main__":
-    if TEST_OP_GROUP == 'demo':
-        current_token = 0
-        tmp_output_dim = 1
-        current_inst_cnt = 0
-        file_name = "instruction_{}T_32L{}_write_back_logit_everyT_{}.bin".format(SIM_NUM_TOKEN, SIM_LOGIT_FLAG, SIM_CMODE)
-        with open(file_name, "wb") as f:
-            for tk in range(SIM_NUM_TOKEN):
-                print("Gen {} th token instruction start".format(tk+1))
-                # LOAD IN_ACT
-                inst_load_in_act.input_addr = ACT_BASE_ADDR + ACT_TOKEN_OFFSET * tk
-                inst_load_in_act.token      = current_token
-                binary_instruction = inst_load_in_act.to_binary(current_inst_cnt, 'LOAD IN_ACT')
-                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                current_inst_cnt += 1
-                # LOAD IN_ACT_S
-                inst_load_in_act_s.input_addr = ACTS_BASE_ADDR + ACTS_TOKEN_OFFSET * tk
-                inst_load_in_act_s.token      = current_token
-                binary_instruction = inst_load_in_act_s.to_binary(current_inst_cnt, 'LOAD IN_ACT_S')
-                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                current_inst_cnt += 1
-                for l in range(SIM_NUM_LAYER):
-                    print("Gen {} th token, {} th layer instruction".format(tk+1, l+1))
-                    # RRMS
-                    inst_prerrms.token  = current_token
-                    binary_instruction  = inst_prerrms.to_binary(current_inst_cnt, 'PRE RRMS')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight
-                    inst_load_pre_wm.input_addr = PRENORM_ADDR + PRENORM_LAYER_OFFSET * l
-                    inst_load_pre_wm.token      = current_token
-                    binary_instruction          = inst_load_pre_wm.to_binary(current_inst_cnt, 'LOAD PRENORM weight')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight scale
-                    inst_load_pre_ws.input_addr = PRENORMS_ADDR + PRENORMS_LAYER_OFFSET * l
-                    inst_load_pre_ws.token      = current_token
-                    binary_instruction          = inst_load_pre_ws.to_binary(current_inst_cnt, 'LOAD PRENORM weight scale')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RMSNORM
-                    inst_prermsnorm.token       = current_token
-                    binary_instruction          = inst_prermsnorm.to_binary(current_inst_cnt, 'PRE RMSNORM')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM Wv
-                        inst_mlp_wm_v.input_addr    = WV_BASE_ADDR + WV_LAYER_OFFSET * l
-                        inst_mlp_wm_v.scale_addr    = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
-                        inst_mlp_wm_v.token         = current_token
-                        binary_instruction          = inst_mlp_wm_v.to_binary(current_inst_cnt, 'MLP_WM Wv')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM Wv
-                        inst_mlp_wm_v_mxint4.input_addr = WV_BASE_ADDR + (WV_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_v_mxint4.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
-                        inst_mlp_wm_v_mxint4.token      = current_token
-                        binary_instruction              = inst_mlp_wm_v_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wv')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # STORE V elem
-                    inst_store_v.output_addr    = VC_BASE_ADDR + VC_LAYER_OFFSET * l + VC_TOKEN_OFFSET * tk
-                    inst_store_v.token          = current_token
-                    binary_instruction          = inst_store_v.to_binary(current_inst_cnt, 'STORE V elem')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE V scale
-                    inst_store_v_s.output_addr  = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l + VCS_TOKEN_OFFSET * tk
-                    inst_store_v_s.token        = current_token
-                    binary_instruction          = inst_store_v_s.to_binary(current_inst_cnt, 'STORE V scale')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM Wk
-                        inst_mlp_wm_k.input_addr    = WK_BASE_ADDR + WK_LAYER_OFFSET * l
-                        inst_mlp_wm_k.scale_addr    = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
-                        inst_mlp_wm_k.token         = current_token
-                        binary_instruction          = inst_mlp_wm_k.to_binary(current_inst_cnt, 'MLP_WM Wk')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM Wk
-                        inst_mlp_wm_k_mxint4.input_addr = WK_BASE_ADDR + (WK_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_k_mxint4.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
-                        inst_mlp_wm_k_mxint4.token      = current_token
-                        binary_instruction              = inst_mlp_wm_k_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wk')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # ROPE WB (K)
-                    inst_rope_wb.input_addr     = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
-                    inst_rope_wb.token          = current_token
-                    binary_instruction          = inst_rope_wb.to_binary(current_inst_cnt, 'ROPE WB (K)')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE K elem
-                    inst_store_k.output_addr    = KC_BASE_ADDR + KC_LAYER_OFFSET * l + KC_TOKEN_OFFSET * tk
-                    inst_store_k.token          = current_token
-                    binary_instruction          = inst_store_k.to_binary(current_inst_cnt, 'STORE K elem')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE K scale
-                    inst_store_k_s.output_addr  = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l + KCS_TOKEN_OFFSET * tk
-                    inst_store_k_s.token        = current_token
-                    binary_instruction          = inst_store_k_s.to_binary(current_inst_cnt, 'STORE K scale')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM Wq
-                        inst_mlp_wm_q.input_addr    = WQ_BASE_ADDR + WQ_LAYER_OFFSET * l
-                        inst_mlp_wm_q.scale_addr    = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
-                        inst_mlp_wm_q.token         = current_token
-                        binary_instruction          = inst_mlp_wm_q.to_binary(current_inst_cnt, 'MLP_WM Wq')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM Wq
-                        inst_mlp_wm_q_mxint4.input_addr = WQ_BASE_ADDR + (WQ_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_q_mxint4.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
-                        inst_mlp_wm_q_mxint4.token      = current_token
-                        binary_instruction              = inst_mlp_wm_q_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wq')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # ROPE NO_WB (Q)
-                    inst_rope_nwb.input_addr    = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
-                    inst_rope_nwb.token         = current_token
-                    binary_instruction          = inst_rope_nwb.to_binary(current_inst_cnt, 'ROPE NO_WB (Q)')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # MLP_QKT
-                    inst_mlp_qkt.input_addr     = KC_BASE_ADDR + KC_LAYER_OFFSET * l
-                    inst_mlp_qkt.scale_addr     = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l
-                    inst_mlp_qkt.output_dim     = tmp_output_dim
-                    inst_mlp_qkt.input_dim      = tmp_output_dim * inst_mlp_qkt.num_cb_ws
-                    inst_mlp_qkt.token          = current_token
-                    binary_instruction          = inst_mlp_qkt.to_binary(current_inst_cnt, 'MLP_QKT')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # QKT_M_RSQRT
-                    inst_qkt_m_rsqrt.output_dim = tmp_output_dim
-                    inst_qkt_m_rsqrt.input_dim  = tmp_output_dim * inst_qkt_m_rsqrt.num_cb_ws
-                    inst_qkt_m_rsqrt.token      = current_token
-                    binary_instruction          = inst_qkt_m_rsqrt.to_binary(current_inst_cnt, 'QKT_M_RSQRT')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # SOFTMAX
-                    inst_softmax.output_dim     = tmp_output_dim
-                    inst_softmax.token          = current_token
-                    binary_instruction          = inst_softmax.to_binary(current_inst_cnt, 'SOFTMAX')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # MLP_HP (SxV)
-                    inst_mlp_hp.input_addr      = VC_BASE_ADDR + VC_LAYER_OFFSET * l
-                    inst_mlp_hp.scale_addr      = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l
-                    inst_mlp_hp.input_dim       = tmp_output_dim * inst_mlp_hp.num_cb_ws
-                    inst_mlp_hp.token           = current_token
-                    binary_instruction          = inst_mlp_hp.to_binary(current_inst_cnt, 'MLP_HP (SxV)')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM Wo
-                        inst_mlp_wm_o.input_addr    = WO_BASE_ADDR + WO_LAYER_OFFSET * l
-                        inst_mlp_wm_o.scale_addr    = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
-                        inst_mlp_wm_o.token         = current_token
-                        binary_instruction          = inst_mlp_wm_o.to_binary(current_inst_cnt, 'MLP_WM Wo')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM Wo
-                        inst_mlp_wm_o_mxint4.input_addr = WO_BASE_ADDR + (WO_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_o_mxint4.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
-                        inst_mlp_wm_o_mxint4.token      = current_token
-                        binary_instruction              = inst_mlp_wm_o_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wo')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # LOAD RESI
-                    inst_load_resi.input_addr   = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
-                    inst_load_resi.token        = current_token
-                    binary_instruction          = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD RESI_S
-                    inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
-                    inst_load_resi_s.token      = current_token
-                    binary_instruction          = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RESIDUAL
-                    inst_residual.token         = current_token
-                    binary_instruction          = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE RESI
-                    inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
-                    inst_store_resi.token       = current_token
-                    binary_instruction          = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE RESI_S
-                    inst_store_resi_s.output_addr   = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
-                    inst_store_resi_s.token         = current_token
-                    binary_instruction              = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RRMS
-                    inst_postrrms.token             = current_token
-                    binary_instruction              = inst_postrrms.to_binary(current_inst_cnt, 'POST RRMS')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight
-                    inst_load_post_wm.input_addr    = POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l
-                    inst_load_post_wm.token         = current_token
-                    binary_instruction              = inst_load_post_wm.to_binary(current_inst_cnt, 'LOAD POSTNORM weight')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight scale
-                    inst_load_post_ws.input_addr    = POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l
-                    inst_load_post_ws.token         = current_token
-                    binary_instruction              = inst_load_post_ws.to_binary(current_inst_cnt, 'LOAD POSTNORM weight scale')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RMSNORM
-                    inst_postrmsnorm.token          = current_token
-                    binary_instruction              = inst_postrmsnorm.to_binary(current_inst_cnt, 'POST RMSNORM')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM W1
-                        inst_mlp_wm_w1.input_addr   = W1_BASE_ADDR + W1_LAYER_OFFSET * l
-                        inst_mlp_wm_w1.scale_addr   = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
-                        inst_mlp_wm_w1.token        = current_token
-                        binary_instruction          = inst_mlp_wm_w1.to_binary(current_inst_cnt, 'MLP_WM W1')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                        # MLP_WM W3
-                        inst_mlp_wm_w3.input_addr   = W3_BASE_ADDR + W3_LAYER_OFFSET * l
-                        inst_mlp_wm_w3.scale_addr   = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
-                        inst_mlp_wm_w3.token        = current_token
-                        binary_instruction          = inst_mlp_wm_w3.to_binary(current_inst_cnt, 'MLP_WM W3')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM W1
-                        inst_mlp_wm_w1_mxint4.input_addr    = W1_BASE_ADDR + (W1_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_w1_mxint4.scale_addr    = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
-                        inst_mlp_wm_w1_mxint4.token         = current_token
-                        binary_instruction                  = inst_mlp_wm_w1_mxint4.to_binary(current_inst_cnt, 'MLP_WM W1')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                        # MLP_WM W3
-                        inst_mlp_wm_w3_mxint4.input_addr    = W3_BASE_ADDR + (W3_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_w3_mxint4.scale_addr    = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
-                        inst_mlp_wm_w3_mxint4.token         = current_token
-                        binary_instruction                  = inst_mlp_wm_w3_mxint4.to_binary(current_inst_cnt, 'MLP_WM W3')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # GATE
-                    inst_gate.token             = current_token
-                    binary_instruction          = inst_gate.to_binary(current_inst_cnt, 'GATE')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    if SIM_CMODE == 'mxint8':
-                        # MLP_WM W2
-                        inst_mlp_wm_w2.input_addr   = W2_BASE_ADDR + W2_LAYER_OFFSET * l
-                        inst_mlp_wm_w2.scale_addr   = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
-                        inst_mlp_wm_w2.token        = current_token
-                        binary_instruction          = inst_mlp_wm_w2.to_binary(current_inst_cnt, 'MLP_WM W2')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    else:
-                        # MLP_WM W2
-                        inst_mlp_wm_w2_mxint4.input_addr    = W2_BASE_ADDR + (W2_LAYER_OFFSET//2) * l
-                        inst_mlp_wm_w2_mxint4.scale_addr    = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
-                        inst_mlp_wm_w2_mxint4.token         = current_token
-                        binary_instruction                  = inst_mlp_wm_w2_mxint4.to_binary(current_inst_cnt, 'MLP_WM W2')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                    # LOAD RESI
-                    inst_load_resi.input_addr   = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
-                    inst_load_resi.token        = current_token
-                    binary_instruction          = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD RESI_S
-                    inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
-                    inst_load_resi_s.token      = current_token
-                    binary_instruction          = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RESIDUAL
-                    inst_residual.token         = current_token
-                    binary_instruction          = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE RESI
-                    inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
-                    inst_store_resi.token       = current_token
-                    binary_instruction          = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # STORE RESI_S
-                    inst_store_resi_s.output_addr   = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
-                    inst_store_resi_s.token         = current_token
-                    binary_instruction              = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # END OF LAYER
-                # if tk == SIM_NUM_TOKEN-1 and SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
-                if SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
-                    # RRMS
-                    inst_outrrms.token              = current_token
-                    binary_instruction              = inst_outrrms.to_binary(current_inst_cnt, 'OUT RRMS')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight
-                    inst_load_out_wm.input_addr     = OUTNORM_ADDR
-                    inst_load_out_wm.token          = current_token
-                    binary_instruction              = inst_load_out_wm.to_binary(current_inst_cnt, 'LOAD OUTNORM weight')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # LOAD NORM weight scale
-                    inst_load_out_ws.input_addr     = OUTNORMS_ADDR
-                    inst_load_out_ws.token          = current_token
-                    binary_instruction              = inst_load_out_ws.to_binary(current_inst_cnt, 'LOAD OUTNORM weight scale')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    # RMSNORM
-                    inst_outrmsnorm.token           = current_token
-                    binary_instruction              = inst_outrmsnorm.to_binary(current_inst_cnt, 'OUT RMSNORM')
-                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                    current_inst_cnt += 1
-                    for it in range(8):
-                        if SIM_CMODE == 'mxint8':
-                            # MLP_WM Whead
-                            inst_mlp_wm_whead.input_addr    = WHEAD_BASE_ADDR + WHEAD_LAYER_OFFSET * it
-                            inst_mlp_wm_whead.scale_addr    = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
-                            inst_mlp_wm_whead.token         = current_token
-                            binary_instruction              = inst_mlp_wm_whead.to_binary(current_inst_cnt, 'MLP_WM Whead')
-                            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                            current_inst_cnt += 1
-                        else:
-                            # MLP_WM Whead
-                            inst_mlp_wm_whead_mxint4.input_addr = WHEAD_BASE_ADDR + (WHEAD_LAYER_OFFSET//2) * it
-                            inst_mlp_wm_whead_mxint4.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
-                            inst_mlp_wm_whead_mxint4.token      = current_token
-                            binary_instruction                  = inst_mlp_wm_whead_mxint4.to_binary(current_inst_cnt, 'MLP_WM Whead')
-                            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                            current_inst_cnt += 1
-                        # STORE HEAD
-                        inst_store_head.output_addr     = HEAD_OUT_BASE_ADDR + HEAD_OUT_LAYER_OFFSET * it
-                        inst_store_head.token           = current_token
-                        binary_instruction              = inst_store_head.to_binary(current_inst_cnt, 'STORE HEAD')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                        # STORE HEAD_S
-                        inst_store_head_s.output_addr   = HEADS_OUT_BASE_ADDR + HEADS_OUT_LAYER_OFFSET * it
-                        inst_store_head_s.token         = current_token
-                        binary_instruction              = inst_store_head_s.to_binary(current_inst_cnt, 'STORE HEAD_S')
-                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-                        current_inst_cnt += 1
-                # Adjust parameters
-                tmp_output_dim += 1
-                current_token += 1
-                # END OF TOKEN
-            # end instruction NOP
-            binary_instruction = inst_nop.to_binary(current_inst_cnt, 'End instruction NOP')
-            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
-            current_inst_cnt += 1
-            # END OF SIMULATION
-        print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
-    elif TEST_OP_GROUP == 'test_bw':  # branch that emits a stream made only of TEST BW instructions
-        current_inst_cnt = 0  # running instruction index, passed to to_binary() and reported at the end
-        with open("../instruction/instruction_test_bw.bin", "wb") as f:  # relative path: resolved against the current working directory
-            for i in range(SIM_NUM_TOKEN):  # one TEST BW instruction per simulated token; the loop index itself is unused
-                binary_instruction = inst_test_bw.to_binary(current_inst_cnt, 'TEST BW')  # parsed below with int(..., 2), i.e. a string of binary digits
-                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))  # little-endian bytes; NOTE(review): assumes the bit count is a multiple of 8 - confirm in to_binary
-                current_inst_cnt += 1  # advance the index for the next instruction
-        print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))  # report how many instructions were written