Delete instruction/gen_instruction.py
Browse files- instruction/gen_instruction.py +0 -612
instruction/gen_instruction.py
DELETED
|
@@ -1,612 +0,0 @@
|
|
| 1 |
-
import argparse
|
| 2 |
-
|
| 3 |
-
# operation groups
# One-hot operation codes: each operation owns exactly one bit position.
NOP         = 1 << 0
TEST_BW     = 1 << 1
LOAD        = 1 << 2
STORE       = 1 << 3
MLP_WM      = 1 << 4
MLP_QKT     = 1 << 5
QKT_M_RSQRT = 1 << 6
MLP_HP      = 1 << 7
GATE        = 1 << 8
RRMS        = 1 << 9
RMSNORM     = 1 << 10
SOFTMAX     = 1 << 11
RESIDUAL    = 1 << 12
ROPE        = 1 << 13

# cmode groups (single-bit selector)
MX_INT8 = 0
MX_INT4 = 1

# stage groups (single-bit selector)
SUM = 0
GEN = 1

# nonlinear groups (single-bit selector)
NO_ACT = 0
SILU   = 1

# load targets groups (3-bit enumeration, not one-hot)
NULL     = 0
ACT      = 1
ACT_S    = 2
RESI     = 3
RESI_S   = 4
WEIGHT_S = 5

# write back groups
# One-hot write-back selectors: each selector owns exactly one bit position.
NO_WB     = 1 << 0
WB        = 1 << 1
WB_KV     = 1 << 2
WB_KV_S   = 1 << 3
WB_ACT    = 1 << 4
WB_ACT_S  = 1 << 5
WB_RESI   = 1 << 6
WB_RESI_S = 1 << 7

# address space
# Each region is described by a base address plus the stride used to step
# between layers and/or tokens. Names ending in "S" are the companion regions
# addressed through the scale_addr / *_S paths of the instruction templates.

# Weight regions: base address and per-layer stride.
WQ_BASE_ADDR = 0x00000000
WQ_LAYER_OFFSET = 0x00100000
WK_BASE_ADDR = 0x02000000
WK_LAYER_OFFSET = 0x00100000
WV_BASE_ADDR = 0x04000000
WV_LAYER_OFFSET = 0x00100000
WO_BASE_ADDR = 0x06000000
WO_LAYER_OFFSET = 0x00100000
W1_BASE_ADDR = 0x08000000
W1_LAYER_OFFSET = 0x002B0000
W3_BASE_ADDR = 0x0D600000
W3_LAYER_OFFSET = 0x002B0000
W2_BASE_ADDR = 0x12C00000
W2_LAYER_OFFSET = 0x002B0000

# Weight-scale regions: base address and per-layer stride.
WQS_BASE_ADDR = 0x18200000
WQS_LAYER_OFFSET = 0x00008000
WKS_BASE_ADDR = 0x18300000
WKS_LAYER_OFFSET = 0x00008000
WVS_BASE_ADDR = 0x18400000
WVS_LAYER_OFFSET = 0x00008000
WOS_BASE_ADDR = 0x18500000
WOS_LAYER_OFFSET = 0x00008000
W1S_BASE_ADDR = 0x18600000
W1S_LAYER_OFFSET = 0x00015800
W3S_BASE_ADDR = 0x188B0000
W3S_LAYER_OFFSET = 0x00015800
W2S_BASE_ADDR = 0x18B60000
W2S_LAYER_OFFSET = 0x00018000

# K/V cache regions and their scales: base, per-layer stride, per-token stride.
KC_BASE_ADDR = 0x18E60000
KC_LAYER_OFFSET = 0x00080000
KC_TOKEN_OFFSET = 0x100
VC_BASE_ADDR = 0x19E60000
VC_LAYER_OFFSET = 0x00080000
VC_TOKEN_OFFSET = 0x100
KCS_BASE_ADDR = 0x1AE60000
KCS_LAYER_OFFSET = 0x00040000
KCS_TOKEN_OFFSET = 0x80
VCS_BASE_ADDR = 0x1B660000
VCS_LAYER_OFFSET = 0x00040000
VCS_TOKEN_OFFSET = 0x80

# Activation / residual regions and their scales: base and per-token stride.
ACT_BASE_ADDR = 0x1BE60000
ACT_TOKEN_OFFSET = 0x1000
RESI_BASE_ADDR = 0x1C660000
RESI_TOKEN_OFFSET = 0x1000
ACTS_BASE_ADDR = 0x1CE60000
ACTS_TOKEN_OFFSET = 0x80
RESIS_BASE_ADDR = 0x1CEA0000
RESIS_TOKEN_OFFSET = 0x80

# Norm weights (per-layer stride) and the ROPE table (per-token stride).
PRENORM_ADDR = 0x1CEE0000
PRENORM_LAYER_OFFSET = 0x1000
POSTNORM_ADDR = 0x1CF00000
POSTNORM_LAYER_OFFSET = 0x1000
ROPE_BASE_ADDR = 0x1CF20000
ROPE_TOKEN_OFFSET = 0xC0
PRENORMS_ADDR = 0x1CFE0000
PRENORMS_LAYER_OFFSET = 0x80
POSTNORMS_ADDR = 0x1CFE1000
POSTNORMS_LAYER_OFFSET = 0x80

# Output norm weights and their scales (single instance, no stride).
OUTNORM_ADDR = 0x1D840000
OUTNORMS_ADDR = 0x1D841000

# Head weights, head weight scales, and the head output buffers.
WHEAD_BASE_ADDR = 0x1D000000
WHEAD_LAYER_OFFSET = 0x100000
WHEADS_BASE_ADDR = 0x1D800000
WHEADS_LAYER_OFFSET = 0x8000
HEAD_OUT_BASE_ADDR = 0x1E000000
HEAD_OUT_LAYER_OFFSET = 0x1000
HEADS_OUT_BASE_ADDR = 0x1E008000
HEADS_OUT_LAYER_OFFSET = 0x80
|
| 78 |
-
|
| 79 |
-
class Instruction:
    """A single instruction that serializes to a 512-bit binary string.

    The instruction is a plain record of unsigned integer fields. Callers
    typically build one template per operation and then overwrite individual
    attributes (for example ``input_addr`` or ``token``) before each call to
    :meth:`to_binary`.
    """

    # (attribute name, width in bits), listed in the order the fields are
    # emitted by to_binary(); the first entry ends up most significant.
    _FIELD_LAYOUT = (
        ('op', 14),
        ('dq_en', 1),
        ('stage', 1),
        ('token', 11),
        ('load_target', 3),
        ('cmode', 1),
        ('nonlinear', 1),
        ('write_back', 8),
        ('input_dim', 16),
        ('output_dim', 16),
        ('input_addr', 32),
        ('scale_addr', 32),
        ('output_addr', 32),
        ('layer_offset', 32),
        ('token_offset', 32),
        ('num_cb_ws', 16),
        ('num_cb_wm', 16),
    )

    # Width of the emitted word. The fields above occupy 264 bits; the rest is
    # zero padding on the most-significant side.
    _WORD_BITS = 512

    def __init__(self,
                 op,
                 dq_en,
                 stage,
                 token,
                 load_target,
                 cmode,
                 nonlinear,
                 write_back,
                 input_dim,
                 output_dim,
                 input_addr,
                 scale_addr,
                 output_addr,
                 layer_offset,
                 token_offset,
                 num_cb_ws,
                 num_cb_wm
                 ):
        """Store every field as given; values are validated in to_binary()."""
        self.op = op
        self.dq_en = dq_en
        self.stage = stage
        self.token = token
        self.load_target = load_target
        self.cmode = cmode
        self.nonlinear = nonlinear
        self.write_back = write_back
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_addr = input_addr
        self.scale_addr = scale_addr
        self.output_addr = output_addr
        self.layer_offset = layer_offset
        self.token_offset = token_offset
        self.num_cb_ws = num_cb_ws
        self.num_cb_wm = num_cb_wm

    def to_binary(self, inst_num:int, inst_info:str):
        """Return the instruction as a 512-character string of '0'/'1'.

        Args:
            inst_num: Running instruction index; only used in the log line.
            inst_info: Human-readable label; only used in the log line.

        Returns:
            The fields concatenated in ``_FIELD_LAYOUT`` order (first field
            most significant), left-padded with zeros to 512 bits.

        Raises:
            ValueError: If a field is negative or does not fit in its bit
                width. Without this check an oversized value would widen its
                slot, shift every more-significant field and produce a word
                longer than 512 bits.
        """
        print('INFO: {:30s} , Instruction id: {}'.format(inst_info, inst_num))
        # Convert the instruction to a binary format
        chunks = []
        for name, width in self._FIELD_LAYOUT:
            value = getattr(self, name)
            if not 0 <= value < (1 << width):
                raise ValueError(
                    "Instruction field '{}' = {!r} does not fit in {} unsigned bit(s)"
                    .format(name, value, width)
                )
            chunks.append(format(value, '0{}b'.format(width)))
        return ''.join(chunks).rjust(self._WORD_BITS, '0')
|
| 130 |
-
|
| 131 |
-
def gen_inst(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm):
    """Build an Instruction from the given field values, passed through unchanged."""
    return Instruction(
        op=op,
        dq_en=dq_en,
        stage=stage,
        token=token,
        load_target=load_target,
        cmode=cmode,
        nonlinear=nonlinear,
        write_back=write_back,
        input_dim=input_dim,
        output_dim=output_dim,
        input_addr=input_addr,
        scale_addr=scale_addr,
        output_addr=output_addr,
        layer_offset=layer_offset,
        token_offset=token_offset,
        num_cb_ws=num_cb_ws,
        num_cb_wm=num_cb_wm,
    )
|
| 133 |
-
# =============================================================================
# Instruction templates
#
# Every template is built with gen_inst(); arguments are positional, laid out
# on two lines:
#   line 1: op, dq_en, stage, token, load_target, cmode, nonlinear, write_back,
#           input_dim, output_dim
#   line 2: input_addr, scale_addr, output_addr, layer_offset, token_offset,
#           num_cb_ws, num_cb_wm
# =============================================================================

# -----------------------------------------------------------------------------
# Test bandwidth
# -----------------------------------------------------------------------------
# NOTE: inst_test_bw is bound again in the MXINT8 section below (stage GEN),
# so this SUM-stage binding is replaced before anything else can read it.
inst_test_bw = gen_inst(
    TEST_BW, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0,
    0, 0, 0, 0, 0, 0, 0,
)

# -----------------------------------------------------------------------------
# MXINT8 Instruction templates
# -----------------------------------------------------------------------------
inst_nop = gen_inst(
    NOP, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0,
    0, 0, 0, 0, 0, 0, 0,
)
inst_test_bw = gen_inst(
    TEST_BW, 0, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0,
    0, 0, 0, 0, 0, 0, 0,
)
inst_load_resi = gen_inst(
    LOAD, 0, GEN, 0, RESI, MX_INT8, NO_ACT, NO_WB, 4096, 0,
    RESI_BASE_ADDR, 0, 0, 0, RESI_TOKEN_OFFSET, 0, 0,
)
inst_load_in_act = gen_inst(
    LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0,
    ACT_BASE_ADDR, 0, 0, 0, ACT_TOKEN_OFFSET, 0, 0,
)
inst_load_resi_s = gen_inst(
    LOAD, 0, GEN, 0, RESI_S, MX_INT8, NO_ACT, NO_WB, 128, 0,
    RESIS_BASE_ADDR, 0, 0, 0, RESIS_TOKEN_OFFSET, 0, 0,
)
inst_load_in_act_s = gen_inst(
    LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0,
    ACTS_BASE_ADDR, 0, 0, 0, ACTS_TOKEN_OFFSET, 0, 0,
)
inst_mlp_wm_q = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    WQ_BASE_ADDR, WQS_BASE_ADDR, 0, WQ_LAYER_OFFSET, 0, 128, 4096,
)
inst_mlp_wm_k = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    WK_BASE_ADDR, WKS_BASE_ADDR, 0, WK_LAYER_OFFSET, 0, 128, 4096,
)
inst_mlp_wm_v = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096,
    WV_BASE_ADDR, WVS_BASE_ADDR, 0, WV_LAYER_OFFSET, 0, 128, 4096,
)
inst_mlp_wm_o = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    WO_BASE_ADDR, WOS_BASE_ADDR, 0, WO_LAYER_OFFSET, 0, 128, 4096,
)
inst_mlp_wm_w1 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, SILU, NO_WB, 4096, 11008,
    W1_BASE_ADDR, W1S_BASE_ADDR, 0, W1_LAYER_OFFSET, 0, 32, 1024,
)
inst_mlp_wm_w3 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 11008,
    W3_BASE_ADDR, W3S_BASE_ADDR, 0, W3_LAYER_OFFSET, 0, 32, 1024,
)
inst_mlp_wm_w2 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 11008, 4096,
    W2_BASE_ADDR, W2S_BASE_ADDR, 0, W2_LAYER_OFFSET, 0, 24, 688,
)
inst_gate = gen_inst(
    GATE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 11008, 11008,
    0, 0, 0, 0, 0, 0, 0,
)
inst_residual = gen_inst(
    RESIDUAL, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 4096, 4096,
    0, 0, 0, 0, 0, 0, 0,
)
inst_store_act = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096,
    0, 0, ACT_BASE_ADDR, 0, ACT_TOKEN_OFFSET, 0, 0,
)
inst_store_act_s = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096,
    0, 0, ACTS_BASE_ADDR, 0, ACTS_TOKEN_OFFSET, 0, 0,
)
# NOTE(review): stores to the RESI region with the WB_ACT flag rather than
# WB_RESI -- confirm this is intended.
inst_store_resi = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096,
    0, 0, RESI_BASE_ADDR, 0, RESI_TOKEN_OFFSET, 0, 0,
)
# NOTE(review): stores to the RESIS region with the WB_ACT_S flag rather than
# WB_RESI_S -- confirm this is intended.
inst_store_resi_s = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096,
    0, 0, RESIS_BASE_ADDR, 0, RESIS_TOKEN_OFFSET, 0, 0,
)
inst_rope_nwb = gen_inst(
    ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 192, 4096,
    ROPE_BASE_ADDR, 0, 0, 0, ROPE_TOKEN_OFFSET, 0, 0,
)
inst_rope_wb = gen_inst(
    ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 192, 4096,
    ROPE_BASE_ADDR, 0, 0, 0, ROPE_TOKEN_OFFSET, 0, 0,
)
inst_store_k = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096,
    0, 0, KC_BASE_ADDR, 0, KC_TOKEN_OFFSET, 0, 0,
)
inst_store_k_s = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096,
    0, 0, KCS_BASE_ADDR, 0, KCS_TOKEN_OFFSET, 0, 0,
)
inst_store_v = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096,
    0, 0, VC_BASE_ADDR, 0, VC_TOKEN_OFFSET, 0, 0,
)
inst_store_v_s = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096,
    0, 0, VCS_BASE_ADDR, 0, VCS_TOKEN_OFFSET, 0, 0,
)
inst_mlp_qkt = gen_inst(
    MLP_QKT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 1,
    KC_BASE_ADDR, KCS_BASE_ADDR, 0, 0, KCS_TOKEN_OFFSET, 4, 32,
)
inst_qkt_m_rsqrt = gen_inst(
    QKT_M_RSQRT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 1, 1,
    0, 0, 0, 0, 0, 1, 512,
)
inst_softmax = gen_inst(
    SOFTMAX, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 1,
    0, 0, 0, 0, 0, 1, 0,
)
inst_mlp_hp = gen_inst(
    MLP_HP, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 4096,
    VC_BASE_ADDR, VCS_BASE_ADDR, 0, 0, VCS_TOKEN_OFFSET, 4, 32,
)
inst_prerrms = gen_inst(
    RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1,
    0, 0, 0, 0, 0, 0, 128,
)
inst_load_pre_wm = gen_inst(
    LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0,
    PRENORM_ADDR, 0, 0, PRENORM_LAYER_OFFSET, 0, 0, 0,
)
inst_load_pre_ws = gen_inst(
    LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0,
    PRENORMS_ADDR, 0, 0, PRENORMS_LAYER_OFFSET, 0, 0, 0,
)
inst_prermsnorm = gen_inst(
    RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    0, 0, 0, 0, 0, 0, 128,
)
inst_postrrms = gen_inst(
    RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1,
    0, 0, 0, 0, 0, 0, 128,
)
inst_load_post_wm = gen_inst(
    LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0,
    POSTNORM_ADDR, 0, 0, POSTNORM_LAYER_OFFSET, 0, 0, 0,
)
inst_load_post_ws = gen_inst(
    LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0,
    POSTNORMS_ADDR, 0, 0, POSTNORMS_LAYER_OFFSET, 0, 0, 0,
)
inst_postrmsnorm = gen_inst(
    RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    0, 0, 0, 0, 0, 0, 128,
)

# -----------------------------------------------------------------------------
# DECODER OUT Instruction templates
# -----------------------------------------------------------------------------
inst_outrrms = gen_inst(
    RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1,
    0, 0, 0, 0, 0, 0, 128,
)
inst_load_out_wm = gen_inst(
    LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0,
    OUTNORM_ADDR, 0, 0, 0, 0, 0, 0,
)
inst_load_out_ws = gen_inst(
    LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0,
    OUTNORMS_ADDR, 0, 0, 0, 0, 0, 0,
)
inst_outrmsnorm = gen_inst(
    RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096,
    0, 0, 0, 0, 0, 0, 128,
)
inst_mlp_wm_whead = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096,
    WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0, WHEAD_LAYER_OFFSET, 0, 128, 4096,
)
inst_store_head = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096,
    0, 0, HEAD_OUT_BASE_ADDR, 0, HEAD_OUT_LAYER_OFFSET, 0, 0,
)
inst_store_head_s = gen_inst(
    STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096,
    0, 0, HEADS_OUT_BASE_ADDR, 0, HEADS_OUT_LAYER_OFFSET, 0, 0,
)

# -----------------------------------------------------------------------------
# MXINT4 Instruction templates
# -----------------------------------------------------------------------------
inst_mlp_wm_whead_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096,
    WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0, WHEAD_LAYER_OFFSET, 0, 128, 2048,
)
inst_mlp_wm_q_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096,
    WQ_BASE_ADDR, WQS_BASE_ADDR, 0, WQ_LAYER_OFFSET, 0, 128, 2048,
)
inst_mlp_wm_k_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096,
    WK_BASE_ADDR, WKS_BASE_ADDR, 0, WK_LAYER_OFFSET, 0, 128, 2048,
)
inst_mlp_wm_v_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096,
    WV_BASE_ADDR, WVS_BASE_ADDR, 0, WV_LAYER_OFFSET, 0, 128, 2048,
)
inst_mlp_wm_o_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096,
    WO_BASE_ADDR, WOS_BASE_ADDR, 0, WO_LAYER_OFFSET, 0, 128, 2048,
)
inst_mlp_wm_w1_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, SILU, NO_WB, 4096, 11008,
    W1_BASE_ADDR, W1S_BASE_ADDR, 0, W1_LAYER_OFFSET, 0, 32, 512,
)
inst_mlp_wm_w3_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 11008,
    W3_BASE_ADDR, W3S_BASE_ADDR, 0, W3_LAYER_OFFSET, 0, 32, 512,
)
inst_mlp_wm_w2_mxint4 = gen_inst(
    MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 11008, 4096,
    W2_BASE_ADDR, W2S_BASE_ADDR, 0, W2_LAYER_OFFSET, 0, 24, 344,
)
|
| 198 |
-
# =====================================================================================================================================================================================================================
|
| 199 |
-
# Instruction templates end
|
| 200 |
-
# =====================================================================================================================================================================================================================
|
| 201 |
-
# gen instructions parameters
# Command-line options are parsed at module level, so the SIM_* settings below
# are fixed before the generation code runs.
parser = argparse.ArgumentParser(
    description='Generate instruction binary file for simulation',
)
parser.add_argument(
    '--sim_cmode',
    type=str,
    default='mxint8',
    choices=['mxint8', 'mxint4'],
    help='Simulation compute mode',
)
parser.add_argument(
    '--sim_llm_head',
    action='store_true',
    help='if true, generate LLM head instructions for FPGA simulation',
)
args = parser.parse_args()

# Fixed settings for the generated instruction stream.
TEST_OP_GROUP = 'demo'
SIM_NUM_TOKEN = 1024
SIM_NUM_LAYER = 32

# Settings taken from the command line.
SIM_CMODE = args.sim_cmode
SIM_LLM_HEAD = args.sim_llm_head

# Fragment inserted into the output file name: empty when the LLM head is
# simulated, '_no' otherwise.
if SIM_LLM_HEAD:
    SIM_LOGIT_FLAG = ''
else:
    SIM_LOGIT_FLAG = '_no'
|
| 213 |
-
|
| 214 |
-
if __name__ == "__main__":
|
| 215 |
-
if TEST_OP_GROUP == 'demo':
|
| 216 |
-
current_token = 0
|
| 217 |
-
tmp_output_dim = 1
|
| 218 |
-
current_inst_cnt = 0
|
| 219 |
-
file_name = "instruction_{}T_32L{}_write_back_logit_everyT_{}.bin".format(SIM_NUM_TOKEN, SIM_LOGIT_FLAG, SIM_CMODE)
|
| 220 |
-
|
| 221 |
-
with open(file_name, "wb") as f:
|
| 222 |
-
for tk in range(SIM_NUM_TOKEN):
|
| 223 |
-
print("Gen {} th token instruction start".format(tk+1))
|
| 224 |
-
# LOAD IN_ACT
|
| 225 |
-
inst_load_in_act.input_addr = ACT_BASE_ADDR + ACT_TOKEN_OFFSET * tk
|
| 226 |
-
inst_load_in_act.token = current_token
|
| 227 |
-
binary_instruction = inst_load_in_act.to_binary(current_inst_cnt, 'LOAD IN_ACT')
|
| 228 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 229 |
-
current_inst_cnt += 1
|
| 230 |
-
# LOAD IN_ACT_S
|
| 231 |
-
inst_load_in_act_s.input_addr = ACTS_BASE_ADDR + ACTS_TOKEN_OFFSET * tk
|
| 232 |
-
inst_load_in_act_s.token = current_token
|
| 233 |
-
binary_instruction = inst_load_in_act_s.to_binary(current_inst_cnt, 'LOAD IN_ACT_S')
|
| 234 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 235 |
-
current_inst_cnt += 1
|
| 236 |
-
|
| 237 |
-
for l in range(SIM_NUM_LAYER):
|
| 238 |
-
print("Gen {} th token, {} th layer instruction".format(tk+1, l+1))
|
| 239 |
-
# RRMS
|
| 240 |
-
inst_prerrms.token = current_token
|
| 241 |
-
binary_instruction = inst_prerrms.to_binary(current_inst_cnt, 'PRE RRMS')
|
| 242 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 243 |
-
current_inst_cnt += 1
|
| 244 |
-
# LOAD NORM weight
|
| 245 |
-
inst_load_pre_wm.input_addr = PRENORM_ADDR + PRENORM_LAYER_OFFSET * l
|
| 246 |
-
inst_load_pre_wm.token = current_token
|
| 247 |
-
binary_instruction = inst_load_pre_wm.to_binary(current_inst_cnt, 'LOAD PRENORM weight')
|
| 248 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 249 |
-
current_inst_cnt += 1
|
| 250 |
-
# LOAD NORM weight scale
|
| 251 |
-
inst_load_pre_ws.input_addr = PRENORMS_ADDR + PRENORMS_LAYER_OFFSET * l
|
| 252 |
-
inst_load_pre_ws.token = current_token
|
| 253 |
-
binary_instruction = inst_load_pre_ws.to_binary(current_inst_cnt, 'LOAD PRENORM weight scale')
|
| 254 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 255 |
-
current_inst_cnt += 1
|
| 256 |
-
# RMSNORM
|
| 257 |
-
inst_prermsnorm.token = current_token
|
| 258 |
-
binary_instruction = inst_prermsnorm.to_binary(current_inst_cnt, 'PRE RMSNORM')
|
| 259 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 260 |
-
current_inst_cnt += 1
|
| 261 |
-
|
| 262 |
-
if SIM_CMODE == 'mxint8':
|
| 263 |
-
# MLP_WM Wv
|
| 264 |
-
inst_mlp_wm_v.input_addr = WV_BASE_ADDR + WV_LAYER_OFFSET * l
|
| 265 |
-
inst_mlp_wm_v.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
|
| 266 |
-
inst_mlp_wm_v.token = current_token
|
| 267 |
-
binary_instruction = inst_mlp_wm_v.to_binary(current_inst_cnt, 'MLP_WM Wv')
|
| 268 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 269 |
-
current_inst_cnt += 1
|
| 270 |
-
else:
|
| 271 |
-
# MLP_WM Wv
|
| 272 |
-
inst_mlp_wm_v_mxint4.input_addr = WV_BASE_ADDR + (WV_LAYER_OFFSET//2) * l
|
| 273 |
-
inst_mlp_wm_v_mxint4.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
|
| 274 |
-
inst_mlp_wm_v_mxint4.token = current_token
|
| 275 |
-
binary_instruction = inst_mlp_wm_v_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wv')
|
| 276 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 277 |
-
current_inst_cnt += 1
|
| 278 |
-
|
| 279 |
-
# STORE V elem
|
| 280 |
-
inst_store_v.output_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l + VC_TOKEN_OFFSET * tk
|
| 281 |
-
inst_store_v.token = current_token
|
| 282 |
-
binary_instruction = inst_store_v.to_binary(current_inst_cnt, 'STORE V elem')
|
| 283 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 284 |
-
current_inst_cnt += 1
|
| 285 |
-
# STORE V scale
|
| 286 |
-
inst_store_v_s.output_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l + VCS_TOKEN_OFFSET * tk
|
| 287 |
-
inst_store_v_s.token = current_token
|
| 288 |
-
binary_instruction = inst_store_v_s.to_binary(current_inst_cnt, 'STORE V scale')
|
| 289 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 290 |
-
current_inst_cnt += 1
|
| 291 |
-
|
| 292 |
-
if SIM_CMODE == 'mxint8':
|
| 293 |
-
# MLP_WM Wk
|
| 294 |
-
inst_mlp_wm_k.input_addr = WK_BASE_ADDR + WK_LAYER_OFFSET * l
|
| 295 |
-
inst_mlp_wm_k.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
|
| 296 |
-
inst_mlp_wm_k.token = current_token
|
| 297 |
-
binary_instruction = inst_mlp_wm_k.to_binary(current_inst_cnt, 'MLP_WM Wk')
|
| 298 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 299 |
-
current_inst_cnt += 1
|
| 300 |
-
else:
|
| 301 |
-
# MLP_WM Wk
|
| 302 |
-
inst_mlp_wm_k_mxint4.input_addr = WK_BASE_ADDR + (WK_LAYER_OFFSET//2) * l
|
| 303 |
-
inst_mlp_wm_k_mxint4.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
|
| 304 |
-
inst_mlp_wm_k_mxint4.token = current_token
|
| 305 |
-
binary_instruction = inst_mlp_wm_k_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wk')
|
| 306 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 307 |
-
current_inst_cnt += 1
|
| 308 |
-
|
| 309 |
-
# ROPE WB (K)
|
| 310 |
-
inst_rope_wb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
|
| 311 |
-
inst_rope_wb.token = current_token
|
| 312 |
-
binary_instruction = inst_rope_wb.to_binary(current_inst_cnt, 'ROPE WB (K)')
|
| 313 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 314 |
-
current_inst_cnt += 1
|
| 315 |
-
# STORE K elem
|
| 316 |
-
inst_store_k.output_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l + KC_TOKEN_OFFSET * tk
|
| 317 |
-
inst_store_k.token = current_token
|
| 318 |
-
binary_instruction = inst_store_k.to_binary(current_inst_cnt, 'STORE K elem')
|
| 319 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 320 |
-
current_inst_cnt += 1
|
| 321 |
-
# STORE K scale
|
| 322 |
-
inst_store_k_s.output_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l + KCS_TOKEN_OFFSET * tk
|
| 323 |
-
inst_store_k_s.token = current_token
|
| 324 |
-
binary_instruction = inst_store_k_s.to_binary(current_inst_cnt, 'STORE K scale')
|
| 325 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 326 |
-
current_inst_cnt += 1
|
| 327 |
-
|
| 328 |
-
if SIM_CMODE == 'mxint8':
|
| 329 |
-
# MLP_WM Wq
|
| 330 |
-
inst_mlp_wm_q.input_addr = WQ_BASE_ADDR + WQ_LAYER_OFFSET * l
|
| 331 |
-
inst_mlp_wm_q.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
|
| 332 |
-
inst_mlp_wm_q.token = current_token
|
| 333 |
-
binary_instruction = inst_mlp_wm_q.to_binary(current_inst_cnt, 'MLP_WM Wq')
|
| 334 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 335 |
-
current_inst_cnt += 1
|
| 336 |
-
else:
|
| 337 |
-
# MLP_WM Wq
|
| 338 |
-
inst_mlp_wm_q_mxint4.input_addr = WQ_BASE_ADDR + (WQ_LAYER_OFFSET//2) * l
|
| 339 |
-
inst_mlp_wm_q_mxint4.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
|
| 340 |
-
inst_mlp_wm_q_mxint4.token = current_token
|
| 341 |
-
binary_instruction = inst_mlp_wm_q_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wq')
|
| 342 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 343 |
-
current_inst_cnt += 1
|
| 344 |
-
|
| 345 |
-
# ROPE NO_WB (Q)
|
| 346 |
-
inst_rope_nwb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
|
| 347 |
-
inst_rope_nwb.token = current_token
|
| 348 |
-
binary_instruction = inst_rope_nwb.to_binary(current_inst_cnt, 'ROPE NO_WB (Q)')
|
| 349 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 350 |
-
current_inst_cnt += 1
|
| 351 |
-
# MLP_QKT
|
| 352 |
-
inst_mlp_qkt.input_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l
|
| 353 |
-
inst_mlp_qkt.scale_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l
|
| 354 |
-
inst_mlp_qkt.output_dim = tmp_output_dim
|
| 355 |
-
inst_mlp_qkt.input_dim = tmp_output_dim * inst_mlp_qkt.num_cb_ws
|
| 356 |
-
inst_mlp_qkt.token = current_token
|
| 357 |
-
binary_instruction = inst_mlp_qkt.to_binary(current_inst_cnt, 'MLP_QKT')
|
| 358 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 359 |
-
current_inst_cnt += 1
|
| 360 |
-
# QKT_M_RSQRT
|
| 361 |
-
inst_qkt_m_rsqrt.output_dim = tmp_output_dim
|
| 362 |
-
inst_qkt_m_rsqrt.input_dim = tmp_output_dim * inst_qkt_m_rsqrt.num_cb_ws
|
| 363 |
-
inst_qkt_m_rsqrt.token = current_token
|
| 364 |
-
binary_instruction = inst_qkt_m_rsqrt.to_binary(current_inst_cnt, 'QKT_M_RSQRT')
|
| 365 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 366 |
-
current_inst_cnt += 1
|
| 367 |
-
# SOFTMAX
|
| 368 |
-
inst_softmax.output_dim = tmp_output_dim
|
| 369 |
-
inst_softmax.token = current_token
|
| 370 |
-
binary_instruction = inst_softmax.to_binary(current_inst_cnt, 'SOFTMAX')
|
| 371 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 372 |
-
current_inst_cnt += 1
|
| 373 |
-
# MLP_HP (SxV)
|
| 374 |
-
inst_mlp_hp.input_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l
|
| 375 |
-
inst_mlp_hp.scale_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l
|
| 376 |
-
inst_mlp_hp.input_dim = tmp_output_dim * inst_mlp_hp.num_cb_ws
|
| 377 |
-
inst_mlp_hp.token = current_token
|
| 378 |
-
binary_instruction = inst_mlp_hp.to_binary(current_inst_cnt, 'MLP_HP (SxV)')
|
| 379 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 380 |
-
current_inst_cnt += 1
|
| 381 |
-
|
| 382 |
-
if SIM_CMODE == 'mxint8':
|
| 383 |
-
# MLP_WM Wo
|
| 384 |
-
inst_mlp_wm_o.input_addr = WO_BASE_ADDR + WO_LAYER_OFFSET * l
|
| 385 |
-
inst_mlp_wm_o.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
|
| 386 |
-
inst_mlp_wm_o.token = current_token
|
| 387 |
-
binary_instruction = inst_mlp_wm_o.to_binary(current_inst_cnt, 'MLP_WM Wo')
|
| 388 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 389 |
-
current_inst_cnt += 1
|
| 390 |
-
else:
|
| 391 |
-
# MLP_WM Wo
|
| 392 |
-
inst_mlp_wm_o_mxint4.input_addr = WO_BASE_ADDR + (WO_LAYER_OFFSET//2) * l
|
| 393 |
-
inst_mlp_wm_o_mxint4.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
|
| 394 |
-
inst_mlp_wm_o_mxint4.token = current_token
|
| 395 |
-
binary_instruction = inst_mlp_wm_o_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wo')
|
| 396 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 397 |
-
current_inst_cnt += 1
|
| 398 |
-
|
| 399 |
-
# LOAD RESI
|
| 400 |
-
inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 401 |
-
inst_load_resi.token = current_token
|
| 402 |
-
binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
|
| 403 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 404 |
-
current_inst_cnt += 1
|
| 405 |
-
# LOAD RESI_S
|
| 406 |
-
inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 407 |
-
inst_load_resi_s.token = current_token
|
| 408 |
-
binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
|
| 409 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 410 |
-
current_inst_cnt += 1
|
| 411 |
-
# RESIDUAL
|
| 412 |
-
inst_residual.token = current_token
|
| 413 |
-
binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
|
| 414 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 415 |
-
current_inst_cnt += 1
|
| 416 |
-
# STORE RESI
|
| 417 |
-
inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 418 |
-
inst_store_resi.token = current_token
|
| 419 |
-
binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
|
| 420 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 421 |
-
current_inst_cnt += 1
|
| 422 |
-
# STORE RESI_S
|
| 423 |
-
inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 424 |
-
inst_store_resi_s.token = current_token
|
| 425 |
-
binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
|
| 426 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 427 |
-
current_inst_cnt += 1
|
| 428 |
-
# RRMS
|
| 429 |
-
inst_postrrms.token = current_token
|
| 430 |
-
binary_instruction = inst_postrrms.to_binary(current_inst_cnt, 'POST RRMS')
|
| 431 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 432 |
-
current_inst_cnt += 1
|
| 433 |
-
# LOAD POSTNORM weight (per-layer address)
|
| 434 |
-
inst_load_post_wm.input_addr = POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l
|
| 435 |
-
inst_load_post_wm.token = current_token
|
| 436 |
-
binary_instruction = inst_load_post_wm.to_binary(current_inst_cnt, 'LOAD POSTNORM weight')
|
| 437 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 438 |
-
current_inst_cnt += 1
|
| 439 |
-
# LOAD POSTNORM weight scale (per-layer address)
|
| 440 |
-
inst_load_post_ws.input_addr = POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l
|
| 441 |
-
inst_load_post_ws.token = current_token
|
| 442 |
-
binary_instruction = inst_load_post_ws.to_binary(current_inst_cnt, 'LOAD POSTNORM weight scale')
|
| 443 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 444 |
-
current_inst_cnt += 1
|
| 445 |
-
# POST RMSNORM
|
| 446 |
-
inst_postrmsnorm.token = current_token
|
| 447 |
-
binary_instruction = inst_postrmsnorm.to_binary(current_inst_cnt, 'POST RMSNORM')
|
| 448 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 449 |
-
current_inst_cnt += 1
|
| 450 |
-
|
| 451 |
-
if SIM_CMODE == 'mxint8':
|
| 452 |
-
# MLP_WM W1
|
| 453 |
-
inst_mlp_wm_w1.input_addr = W1_BASE_ADDR + W1_LAYER_OFFSET * l
|
| 454 |
-
inst_mlp_wm_w1.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
|
| 455 |
-
inst_mlp_wm_w1.token = current_token
|
| 456 |
-
binary_instruction = inst_mlp_wm_w1.to_binary(current_inst_cnt, 'MLP_WM W1')
|
| 457 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 458 |
-
current_inst_cnt += 1
|
| 459 |
-
# MLP_WM W3
|
| 460 |
-
inst_mlp_wm_w3.input_addr = W3_BASE_ADDR + W3_LAYER_OFFSET * l
|
| 461 |
-
inst_mlp_wm_w3.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
|
| 462 |
-
inst_mlp_wm_w3.token = current_token
|
| 463 |
-
binary_instruction = inst_mlp_wm_w3.to_binary(current_inst_cnt, 'MLP_WM W3')
|
| 464 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 465 |
-
current_inst_cnt += 1
|
| 466 |
-
else:
|
| 467 |
-
# MLP_WM W1 (mxint4 instruction; weight layer offset is halved)
|
| 468 |
-
inst_mlp_wm_w1_mxint4.input_addr = W1_BASE_ADDR + (W1_LAYER_OFFSET//2) * l
|
| 469 |
-
inst_mlp_wm_w1_mxint4.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
|
| 470 |
-
inst_mlp_wm_w1_mxint4.token = current_token
|
| 471 |
-
binary_instruction = inst_mlp_wm_w1_mxint4.to_binary(current_inst_cnt, 'MLP_WM W1')
|
| 472 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 473 |
-
current_inst_cnt += 1
|
| 474 |
-
# MLP_WM W3 (mxint4 instruction; weight layer offset is halved)
|
| 475 |
-
inst_mlp_wm_w3_mxint4.input_addr = W3_BASE_ADDR + (W3_LAYER_OFFSET//2) * l
|
| 476 |
-
inst_mlp_wm_w3_mxint4.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
|
| 477 |
-
inst_mlp_wm_w3_mxint4.token = current_token
|
| 478 |
-
binary_instruction = inst_mlp_wm_w3_mxint4.to_binary(current_inst_cnt, 'MLP_WM W3')
|
| 479 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 480 |
-
current_inst_cnt += 1
|
| 481 |
-
|
| 482 |
-
# GATE
|
| 483 |
-
inst_gate.token = current_token
|
| 484 |
-
binary_instruction = inst_gate.to_binary(current_inst_cnt, 'GATE')
|
| 485 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 486 |
-
current_inst_cnt += 1
|
| 487 |
-
|
| 488 |
-
if SIM_CMODE == 'mxint8':
|
| 489 |
-
# MLP_WM W2
|
| 490 |
-
inst_mlp_wm_w2.input_addr = W2_BASE_ADDR + W2_LAYER_OFFSET * l
|
| 491 |
-
inst_mlp_wm_w2.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
|
| 492 |
-
inst_mlp_wm_w2.token = current_token
|
| 493 |
-
binary_instruction = inst_mlp_wm_w2.to_binary(current_inst_cnt, 'MLP_WM W2')
|
| 494 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 495 |
-
current_inst_cnt += 1
|
| 496 |
-
else:
|
| 497 |
-
# MLP_WM W2 (mxint4 instruction; weight layer offset is halved)
|
| 498 |
-
inst_mlp_wm_w2_mxint4.input_addr = W2_BASE_ADDR + (W2_LAYER_OFFSET//2) * l
|
| 499 |
-
inst_mlp_wm_w2_mxint4.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
|
| 500 |
-
inst_mlp_wm_w2_mxint4.token = current_token
|
| 501 |
-
binary_instruction = inst_mlp_wm_w2_mxint4.to_binary(current_inst_cnt, 'MLP_WM W2')
|
| 502 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 503 |
-
current_inst_cnt += 1
|
| 504 |
-
|
| 505 |
-
# LOAD RESI
|
| 506 |
-
inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 507 |
-
inst_load_resi.token = current_token
|
| 508 |
-
binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
|
| 509 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 510 |
-
current_inst_cnt += 1
|
| 511 |
-
# LOAD RESI_S
|
| 512 |
-
inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 513 |
-
inst_load_resi_s.token = current_token
|
| 514 |
-
binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
|
| 515 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 516 |
-
current_inst_cnt += 1
|
| 517 |
-
# RESIDUAL
|
| 518 |
-
inst_residual.token = current_token
|
| 519 |
-
binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
|
| 520 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 521 |
-
current_inst_cnt += 1
|
| 522 |
-
# STORE RESI
|
| 523 |
-
inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 524 |
-
inst_store_resi.token = current_token
|
| 525 |
-
binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
|
| 526 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 527 |
-
current_inst_cnt += 1
|
| 528 |
-
# STORE RESI_S
|
| 529 |
-
inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 530 |
-
inst_store_resi_s.token = current_token
|
| 531 |
-
binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
|
| 532 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 533 |
-
current_inst_cnt += 1
|
| 534 |
-
# END OF LAYER
|
| 535 |
-
|
| 536 |
-
# if tk == SIM_NUM_TOKEN-1 and SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
|
| 537 |
-
if SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
|
| 538 |
-
# OUT RRMS
|
| 539 |
-
inst_outrrms.token = current_token
|
| 540 |
-
binary_instruction = inst_outrrms.to_binary(current_inst_cnt, 'OUT RRMS')
|
| 541 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 542 |
-
current_inst_cnt += 1
|
| 543 |
-
# LOAD OUTNORM weight (single address, not per-layer)
|
| 544 |
-
inst_load_out_wm.input_addr = OUTNORM_ADDR
|
| 545 |
-
inst_load_out_wm.token = current_token
|
| 546 |
-
binary_instruction = inst_load_out_wm.to_binary(current_inst_cnt, 'LOAD OUTNORM weight')
|
| 547 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 548 |
-
current_inst_cnt += 1
|
| 549 |
-
# LOAD OUTNORM weight scale (single address, not per-layer)
|
| 550 |
-
inst_load_out_ws.input_addr = OUTNORMS_ADDR
|
| 551 |
-
inst_load_out_ws.token = current_token
|
| 552 |
-
binary_instruction = inst_load_out_ws.to_binary(current_inst_cnt, 'LOAD OUTNORM weight scale')
|
| 553 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 554 |
-
current_inst_cnt += 1
|
| 555 |
-
# OUT RMSNORM
|
| 556 |
-
inst_outrmsnorm.token = current_token
|
| 557 |
-
binary_instruction = inst_outrmsnorm.to_binary(current_inst_cnt, 'OUT RMSNORM')
|
| 558 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 559 |
-
current_inst_cnt += 1
|
| 560 |
-
|
| 561 |
-
for it in range(8):
|
| 562 |
-
if SIM_CMODE == 'mxint8':
|
| 563 |
-
# MLP_WM Whead
|
| 564 |
-
inst_mlp_wm_whead.input_addr = WHEAD_BASE_ADDR + WHEAD_LAYER_OFFSET * it
|
| 565 |
-
inst_mlp_wm_whead.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
|
| 566 |
-
inst_mlp_wm_whead.token = current_token
|
| 567 |
-
binary_instruction = inst_mlp_wm_whead.to_binary(current_inst_cnt, 'MLP_WM Whead')
|
| 568 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 569 |
-
current_inst_cnt += 1
|
| 570 |
-
else:
|
| 571 |
-
# MLP_WM Whead (mxint4 instruction; weight chunk offset is halved)
|
| 572 |
-
inst_mlp_wm_whead_mxint4.input_addr = WHEAD_BASE_ADDR + (WHEAD_LAYER_OFFSET//2) * it
|
| 573 |
-
inst_mlp_wm_whead_mxint4.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
|
| 574 |
-
inst_mlp_wm_whead_mxint4.token = current_token
|
| 575 |
-
binary_instruction = inst_mlp_wm_whead_mxint4.to_binary(current_inst_cnt, 'MLP_WM Whead')
|
| 576 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 577 |
-
current_inst_cnt += 1
|
| 578 |
-
|
| 579 |
-
# STORE HEAD
|
| 580 |
-
inst_store_head.output_addr = HEAD_OUT_BASE_ADDR + HEAD_OUT_LAYER_OFFSET * it
|
| 581 |
-
inst_store_head.token = current_token
|
| 582 |
-
binary_instruction = inst_store_head.to_binary(current_inst_cnt, 'STORE HEAD')
|
| 583 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 584 |
-
current_inst_cnt += 1
|
| 585 |
-
# STORE HEAD_S
|
| 586 |
-
inst_store_head_s.output_addr = HEADS_OUT_BASE_ADDR + HEADS_OUT_LAYER_OFFSET * it
|
| 587 |
-
inst_store_head_s.token = current_token
|
| 588 |
-
binary_instruction = inst_store_head_s.to_binary(current_inst_cnt, 'STORE HEAD_S')
|
| 589 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 590 |
-
current_inst_cnt += 1
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
# Adjust parameters
|
| 594 |
-
tmp_output_dim += 1
|
| 595 |
-
current_token += 1
|
| 596 |
-
# END OF TOKEN
|
| 597 |
-
|
| 598 |
-
# end instruction NOP
|
| 599 |
-
binary_instruction = inst_nop.to_binary(current_inst_cnt, 'End instruction NOP')
|
| 600 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 601 |
-
current_inst_cnt += 1
|
| 602 |
-
# END OF SIMULATION
|
| 603 |
-
print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
|
| 604 |
-
|
| 605 |
-
elif TEST_OP_GROUP == 'test_bw':
|
| 606 |
-
current_inst_cnt = 0
|
| 607 |
-
with open("../instruction/instruction_test_bw.bin", "wb") as f:
|
| 608 |
-
for i in range(SIM_NUM_TOKEN):
|
| 609 |
-
binary_instruction = inst_test_bw.to_binary(current_inst_cnt, 'TEST BW')
|
| 610 |
-
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 611 |
-
current_inst_cnt += 1
|
| 612 |
-
print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|