python test_benchmark_inference.py -dbg -d ~/llm_models/koala-13B-GPTQ -- Loading model -- Tokenizer: /home/nap/llm_models/koala-13B-GPTQ/tokenizer.model -- Model config: /home/nap/llm_models/koala-13B-GPTQ/config.json -- Model: /home/nap/llm_models/koala-13B-GPTQ/koala-13B-4bit_qwop_cuda_slow.safetensors -- Sequence length: 2048 -- Options: ['attention: switched', 'matmul: switched', 'mlp: switched', 'debug'] !! Available CUDA devices: " !! - cuda:0: NVIDIA GeForce RTX 4090 " !! - cuda:1: NVIDIA RTX A6000 !! Loading safetensors file: /home/nap/llm_models/koala-13B-GPTQ/koala-13B-4bit_qwop_cuda_slow.safetensors !! Begin load tensors !! - lm_head.weight read: device: cpu, shape: [32000, 5120], dtype: float16 !! - lm_head.weight map: device: cuda:0, shape: [32000, 5120], dtype: float16, min: -0.316406, max: 0.361328, std: 0.020935 !! - model.embed_tokens.weight read: device: cpu, shape: [32000, 5120], dtype: float16 !! - model.embed_tokens.weight map: device: cpu, shape: [32000, 5120], dtype: float16 !! - model.layers.0.input_layernorm.weight read: device: cpu, shape: [5120], dtype: float16 !! - model.layers.0.input_layernorm.weight map: device: cuda:0, shape: [5120], dtype: float16, min: -0.002060, max: 0.742188, std: 0.045593 !! - model.layers.0.mlp.down_proj.g_idx read: device: cpu, shape: [13824], dtype: int32 !! - model.layers.0.mlp.down_proj.g_idx map: device: cuda:0, shape: [13824], dtype: int32, min: 0, max: 107 !! - model.layers.0.mlp.down_proj.qweight read: device: cpu, shape: [1728, 5120], dtype: int32 !! - model.layers.0.mlp.down_proj.qweight map: device: cuda:0, shape: [1728, 5120], dtype: int32, min: -2147416079, max: 2147375608 !! - model.layers.0.mlp.down_proj.qzeros read: device: cpu, shape: [108, 640], dtype: int32 !! - model.layers.0.mlp.down_proj.qzeros map: device: cuda:0, shape: [108, 640], dtype: int32, min: -2106165417, max: 2089191031 !! - model.layers.0.mlp.down_proj.scales read: device: cpu, shape: [108, 5120], dtype: float16 !! - model.layers.0.mlp.down_proj.scales map: device: cuda:0, shape: [108, 5120], dtype: float16, min: 0.003326, max: 0.099487, std: 0.001260 !! - model.layers.0.mlp.gate_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.mlp.gate_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.mlp.gate_proj.qweight read: device: cpu, shape: [640, 13824], dtype: int32 !! - model.layers.0.mlp.gate_proj.qweight map: device: cuda:0, shape: [640, 13824], dtype: int32, min: -2147459474, max: 2147466163 !! - model.layers.0.mlp.gate_proj.qzeros read: device: cpu, shape: [40, 1728], dtype: int32 !! - model.layers.0.mlp.gate_proj.qzeros map: device: cuda:0, shape: [40, 1728], dtype: int32, min: -2125109368, max: 2089248375 !! - model.layers.0.mlp.gate_proj.scales read: device: cpu, shape: [40, 13824], dtype: float16 !! - model.layers.0.mlp.gate_proj.scales map: device: cuda:0, shape: [40, 13824], dtype: float16, min: 0.002777, max: 0.060303, std: 0.000990 !! - model.layers.0.mlp.up_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.mlp.up_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.mlp.up_proj.qweight read: device: cpu, shape: [640, 13824], dtype: int32 !! - model.layers.0.mlp.up_proj.qweight map: device: cuda:0, shape: [640, 13824], dtype: int32, min: -2147474830, max: 2147437148 !! - model.layers.0.mlp.up_proj.qzeros read: device: cpu, shape: [40, 1728], dtype: int32 !! - model.layers.0.mlp.up_proj.qzeros map: device: cuda:0, shape: [40, 1728], dtype: int32, min: -2107213722, max: 2089121671 !! - model.layers.0.mlp.up_proj.scales read: device: cpu, shape: [40, 13824], dtype: float16 !! - model.layers.0.mlp.up_proj.scales map: device: cuda:0, shape: [40, 13824], dtype: float16, min: 0.002075, max: 0.040131, std: 0.000730 !! - model.layers.0.post_attention_layernorm.weight read: device: cpu, shape: [5120], dtype: float16 !! - model.layers.0.post_attention_layernorm.weight map: device: cuda:0, shape: [5120], dtype: float16, min: -0.035889, max: 0.361328, std: 0.016113 !! - model.layers.0.self_attn.k_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.self_attn.k_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.self_attn.k_proj.qweight read: device: cpu, shape: [640, 5120], dtype: int32 !! - model.layers.0.self_attn.k_proj.qweight map: device: cuda:0, shape: [640, 5120], dtype: int32, min: -2147305928, max: 2147337675 !! - model.layers.0.self_attn.k_proj.qzeros read: device: cpu, shape: [40, 640], dtype: int32 !! - model.layers.0.self_attn.k_proj.qzeros map: device: cuda:0, shape: [40, 640], dtype: int32, min: -2128119278, max: 2092336937 !! - model.layers.0.self_attn.k_proj.scales read: device: cpu, shape: [40, 5120], dtype: float16 !! - model.layers.0.self_attn.k_proj.scales map: device: cuda:0, shape: [40, 5120], dtype: float16, min: 0.001449, max: 0.082703, std: 0.005592 !! - model.layers.0.self_attn.o_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.self_attn.o_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.self_attn.o_proj.qweight read: device: cpu, shape: [640, 5120], dtype: int32 !! - model.layers.0.self_attn.o_proj.qweight map: device: cuda:0, shape: [640, 5120], dtype: int32, min: -2147453144, max: 2147375548 !! - model.layers.0.self_attn.o_proj.qzeros read: device: cpu, shape: [40, 640], dtype: int32 !! - model.layers.0.self_attn.o_proj.qzeros map: device: cuda:0, shape: [40, 640], dtype: int32, min: -2107209387, max: 2071422582 !! - model.layers.0.self_attn.o_proj.scales read: device: cpu, shape: [40, 5120], dtype: float16 !! - model.layers.0.self_attn.o_proj.scales map: device: cuda:0, shape: [40, 5120], dtype: float16, min: 0.001521, max: 0.089478, std: 0.001425 !! - model.layers.0.self_attn.q_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.self_attn.q_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.self_attn.q_proj.qweight read: device: cpu, shape: [640, 5120], dtype: int32 !! - model.layers.0.self_attn.q_proj.qweight map: device: cuda:0, shape: [640, 5120], dtype: int32, min: -2147399309, max: 2147314245 !! - model.layers.0.self_attn.q_proj.qzeros read: device: cpu, shape: [40, 640], dtype: int32 !! - model.layers.0.self_attn.q_proj.qzeros map: device: cuda:0, shape: [40, 640], dtype: int32, min: -2128450726, max: 2092123285 !! - model.layers.0.self_attn.q_proj.scales read: device: cpu, shape: [40, 5120], dtype: float16 !! - model.layers.0.self_attn.q_proj.scales map: device: cuda:0, shape: [40, 5120], dtype: float16, min: 0.001049, max: 0.095764, std: 0.005581 !! - model.layers.0.self_attn.v_proj.g_idx read: device: cpu, shape: [5120], dtype: int32 !! - model.layers.0.self_attn.v_proj.g_idx map: device: cuda:0, shape: [5120], dtype: int32, min: 0, max: 39 !! - model.layers.0.self_attn.v_proj.qweight read: device: cpu, shape: [640, 5120], dtype: int32 !! - model.layers.0.self_attn.v_proj.qweight map: device: cuda:0, shape: [640, 5120], dtype: int32, min: -2147441095, max: 2147387755 !! - model.layers.0.self_attn.v_proj.qzeros read: device: cpu, shape: [40, 640], dtype: int32 !! - model.layers.0.self_attn.v_proj.qzeros map: device: cuda:0, shape: [40, 640], dtype: int32, min: -2091420041, max: 2071422327 !! - model.layers.0.self_attn.v_proj.scales read: device: cpu, shape: [40, 5120], dtype: float16 !! - model.layers.0.self_attn.v_proj.scales map: device: cuda:0, shape: [40, 5120], dtype: float16, min: 0.001673, max: 0.015762, std: 0.001489 !! - model.norm.weight read: device: cpu, shape: [5120], dtype: float16 !! - model.norm.weight map: device: cuda:0, shape: [5120], dtype: float16, min: 0.018066, max: 2.093750, std: 0.073120 !! Computing RoPE table for seq length: 2048 !! - stored for device: cuda:0 ** Time, Load model: 3.72 seconds -- Groupsize (inferred): 128 -- Act-order (inferred): yes ** VRAM, Model: [cuda:0] 6,689.96 MB - [cuda:1] 0.00 MB !! Inference, debug pass !! Begin forward pass !! Moving input_ids from cuda:0 to cpu !! Built initial hidden state: device: cpu, shape: [1, 1920, 5120], dtype: float16, min: -0.117676, max: 0.114746, std: 0.018738 !! Prepared buffer for device: cuda:0 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! Moving hidden_states from cpu to cuda:0 !! Begin decoder 0 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -0.117676, max: 0.114746, std: 0.018738 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.002060, max: 0.742188, std: 0.045593 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001049/0.095764/0.005581 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001449/0.082703/0.005592 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001673/0.015762/0.001489 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001521/0.089478/0.001425 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1.013672, max: 1.294922, std: 0.035309 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.035889, max: 0.361328, std: 0.016113 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002777/0.060303/0.000990 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002075/0.040131/0.000730 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003326/0.099487/0.001260 !! - method: normal !! Begin decoder 1 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -9.375000, max: 35.843750, std: 0.119446 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.012146, max: 0.326172, std: 0.022308 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001299/0.042847/0.005116 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001262/0.056030/0.005295 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001407/0.011436/0.001119 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001063/0.086609/0.001472 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -9.039062, max: 33.656250, std: 0.116211 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.003036, max: 0.166016, std: 0.010605 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003960/0.075562/0.001144 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003387/0.035187/0.000851 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002762/0.120483/0.001154 !! - method: normal !! Begin decoder 2 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -11.812500, max: 30.734375, std: 0.155029 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.057617, max: 0.369141, std: 0.015396 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002361/0.074585/0.003971 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001963/0.050629/0.004532 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002445/0.020309/0.000759 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002083/0.110596/0.001124 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -11.648438, max: 26.859375, std: 0.158203 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.014099, max: 0.161133, std: 0.011726 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002787/0.087097/0.001152 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003202/0.043213/0.000878 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002434/0.133301/0.001044 !! - method: normal !! Begin decoder 3 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -890.500000, max: 24.171875, std: 0.338135 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033203, max: 0.445312, std: 0.016769 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002218/0.064087/0.003193 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001682/0.047546/0.003334 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002258/0.013161/0.000889 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001929/0.086182/0.001017 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -890.500000, max: 25.640625, std: 0.342529 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.020508, max: 0.185547, std: 0.012711 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002598/0.055603/0.001158 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002819/0.043365/0.000893 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002668/0.083008/0.000952 !! - method: normal !! Begin decoder 4 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -892.500000, max: 24.625000, std: 0.366211 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.036621, max: 0.458984, std: 0.017136 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002357/0.124084/0.003180 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001328/0.042419/0.003229 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002598/0.018280/0.000826 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001725/0.085449/0.000918 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -892.500000, max: 28.000000, std: 0.385742 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.025391, max: 0.200195, std: 0.012398 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003830/0.047241/0.001214 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003572/0.041473/0.000900 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002481/0.095337/0.000922 !! - method: normal !! Begin decoder 5 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -893.000000, max: 25.609375, std: 0.400879 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033203, max: 0.492188, std: 0.019684 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001987/0.102661/0.003073 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001550/0.035492/0.003050 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002256/0.016541/0.000906 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002275/0.106079/0.001011 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -893.000000, max: 29.265625, std: 0.418213 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.042236, max: 0.211914, std: 0.011848 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002773/0.047150/0.001265 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001515/0.041870/0.000920 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002594/0.062195/0.000935 !! - method: normal !! Begin decoder 6 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -894.500000, max: 26.140625, std: 0.445312 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.067871, max: 0.558594, std: 0.019913 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002136/0.046173/0.003099 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001863/0.033478/0.003153 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002909/0.020889/0.000928 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001761/0.096313/0.001001 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -895.000000, max: 25.453125, std: 0.462891 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.038574, max: 0.244141, std: 0.012810 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003599/0.058990/0.001412 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003576/0.044037/0.000947 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002380/0.090454/0.001029 !! - method: normal !! Begin decoder 7 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.125000, std: 0.513672 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.010315, max: 0.609375, std: 0.018875 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002357/0.038116/0.002750 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002035/0.030289/0.002897 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002699/0.013130/0.000939 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001756/0.065430/0.000955 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.312500, std: 0.554688 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.043701, max: 0.222656, std: 0.011360 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003187/0.053528/0.001369 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003983/0.029083/0.000935 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002668/0.070984/0.000947 !! - method: normal !! Begin decoder 8 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.375000, std: 0.583008 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.007812, max: 0.617188, std: 0.021469 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002020/0.036896/0.003115 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001634/0.027725/0.003042 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003176/0.019165/0.000947 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001910/0.084106/0.000935 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.500000, std: 0.605469 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033203, max: 0.228516, std: 0.012070 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003246/0.053589/0.001263 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001094/0.036316/0.000944 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002659/0.075378/0.000929 !! - method: normal !! Begin decoder 9 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.687500, std: 0.612305 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.003876, max: 0.664062, std: 0.020859 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002146/0.038910/0.002712 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001664/0.032074/0.002876 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003122/0.015617/0.000871 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001311/0.095337/0.000900 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.750000, std: 0.625000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.049805, max: 0.238281, std: 0.011787 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003241/0.061310/0.001322 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003132/0.040771/0.000956 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002480/0.081299/0.000928 !! - method: normal !! Begin decoder 10 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.812500, std: 0.635742 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.002594, max: 0.703125, std: 0.021515 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002222/0.033997/0.002638 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001856/0.029907/0.002831 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003365/0.014862/0.000932 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001518/0.084351/0.000958 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.875000, std: 0.654785 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.053955, max: 0.245117, std: 0.011978 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003246/0.042297/0.001295 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003368/0.040710/0.000970 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002800/0.089050/0.000934 !! - method: normal !! Begin decoder 11 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 79.937500, std: 0.669922 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.007355, max: 0.687500, std: 0.021606 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002106/0.034271/0.002579 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002033/0.028885/0.002792 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003374/0.014481/0.000937 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001925/0.075500/0.000946 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 80.000000, std: 0.694336 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.054443, max: 0.251953, std: 0.011749 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003128/0.051086/0.001299 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001537/0.041565/0.000993 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003239/0.079163/0.000940 !! - method: normal !! Begin decoder 12 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 80.062500, std: 0.726074 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.014771, max: 0.664062, std: 0.020920 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002449/0.034271/0.002655 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002136/0.032806/0.002867 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003397/0.019394/0.000961 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001609/0.057343/0.000999 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.250000, std: 0.751953 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.056396, max: 0.249023, std: 0.012207 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003019/0.043274/0.001330 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002712/0.043762/0.001000 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003359/0.118286/0.000953 !! - method: normal !! Begin decoder 13 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.312500, std: 0.787598 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.031982, max: 0.687500, std: 0.021698 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002420/0.034241/0.002577 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002388/0.034241/0.002741 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003078/0.015854/0.000962 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002022/0.078918/0.000970 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.500000, std: 0.809570 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.051025, max: 0.265625, std: 0.012978 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003170/0.036652/0.001327 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004108/0.028717/0.000996 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002531/0.052429/0.000926 !! - method: normal !! Begin decoder 14 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.562500, std: 0.849121 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.025879, max: 0.691406, std: 0.021164 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002115/0.035156/0.002348 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001471/0.031067/0.002569 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003618/0.020035/0.000957 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001540/0.086060/0.000992 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.687500, std: 0.866699 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.055420, max: 0.273438, std: 0.013245 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003336/0.032928/0.001335 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003906/0.045197/0.000993 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002605/0.088013/0.000936 !! - method: normal !! Begin decoder 15 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.687500, std: 0.916016 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.031494, max: 0.679688, std: 0.020615 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002296/0.038727/0.002529 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002375/0.030533/0.002689 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003328/0.015869/0.000980 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001546/0.124634/0.001021 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.750000, std: 0.945801 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.040039, max: 0.291016, std: 0.014809 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003687/0.051025/0.001274 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004307/0.041656/0.000965 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002167/0.078613/0.000919 !! - method: normal !! Begin decoder 16 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.750000, std: 0.993164 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.012573, max: 0.652344, std: 0.020477 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002371/0.034912/0.002207 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001926/0.029617/0.002392 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003460/0.018524/0.000947 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001738/0.051270/0.000971 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.812500, std: 1.004883 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.045898, max: 0.298828, std: 0.015106 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003387/0.036011/0.001249 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003696/0.035187/0.000964 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002268/0.065063/0.000917 !! - method: normal !! Begin decoder 17 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.812500, std: 1.059570 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.025146, max: 0.722656, std: 0.021576 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002331/0.036224/0.002277 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001755/0.030884/0.002550 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003754/0.020874/0.000970 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001672/0.116455/0.001009 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.937500, std: 1.098633 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.042969, max: 0.310547, std: 0.015625 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003586/0.035492/0.001222 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004265/0.044525/0.000955 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002222/0.067993/0.000917 !! - method: normal !! Begin decoder 18 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.152344 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.029907, max: 0.738281, std: 0.022064 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002323/0.033447/0.002235 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001904/0.030121/0.002382 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004002/0.014252/0.000932 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001740/0.083801/0.000958 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.937500, std: 1.186523 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.048584, max: 0.318359, std: 0.015625 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003035/0.034271/0.001252 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003998/0.045654/0.000957 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002491/0.084534/0.000911 !! - method: normal !! Begin decoder 19 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.937500, std: 1.258789 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.024170, max: 0.753906, std: 0.022308 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002134/0.031494/0.002193 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001934/0.030380/0.002371 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003841/0.015404/0.000981 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001974/0.084167/0.001057 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.937500, std: 1.287109 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033936, max: 0.347656, std: 0.016785 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003767/0.040405/0.001213 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004185/0.043823/0.000943 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002474/0.062683/0.000900 !! - method: normal !! Begin decoder 20 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.358398 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.037354, max: 0.757812, std: 0.022324 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002235/0.035187/0.002100 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002291/0.032471/0.002190 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003658/0.014191/0.001044 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001817/0.078064/0.001065 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.393555 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.048096, max: 0.345703, std: 0.016815 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003643/0.044281/0.001211 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004276/0.048615/0.000933 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002605/0.067444/0.000911 !! - method: normal !! Begin decoder 21 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.483398 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.037598, max: 0.796875, std: 0.023514 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002506/0.043945/0.002247 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002028/0.031616/0.002365 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004189/0.014427/0.001028 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001978/0.039856/0.001017 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.525391 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.044922, max: 0.347656, std: 0.017212 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003614/0.052155/0.001178 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004387/0.032867/0.000925 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002708/0.063232/0.000911 !! - method: normal !! Begin decoder 22 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.616211 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.037354, max: 0.753906, std: 0.022934 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002468/0.036316/0.002068 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002302/0.030502/0.002201 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003658/0.014572/0.000998 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002260/0.096069/0.001020 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1029.000000, max: 80.875000, std: 1.678711 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.045654, max: 0.361328, std: 0.018143 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003679/0.035217/0.001136 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004360/0.036133/0.000911 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002403/0.078796/0.000916 !! - method: normal !! Begin decoder 23 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 80.875000, std: 1.774414 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033691, max: 0.792969, std: 0.024429 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002359/0.034546/0.002054 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002043/0.033936/0.002104 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004379/0.013702/0.000979 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001885/0.075256/0.000995 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1028.000000, max: 80.812500, std: 1.833008 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.031982, max: 0.367188, std: 0.019226 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003729/0.050964/0.001107 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004387/0.036224/0.000899 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002159/0.082642/0.000899 !! - method: normal !! Begin decoder 24 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1027.000000, max: 80.937500, std: 1.931641 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.051758, max: 0.812500, std: 0.025452 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002163/0.037628/0.002060 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002029/0.031433/0.002123 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003849/0.016617/0.000987 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001784/0.109741/0.001011 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1027.000000, max: 82.437500, std: 1.982422 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.029419, max: 0.382812, std: 0.020203 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003664/0.039459/0.001067 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004559/0.033142/0.000891 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002037/0.088379/0.000898 !! - method: normal !! Begin decoder 25 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1027.000000, max: 85.312500, std: 2.062500 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.043213, max: 0.816406, std: 0.024796 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001895/0.034515/0.002041 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001381/0.040314/0.002146 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003727/0.015511/0.001091 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002243/0.103149/0.001124 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1027.000000, max: 93.312500, std: 2.140625 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.029663, max: 0.404297, std: 0.020950 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003717/0.032501/0.001052 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004433/0.026627/0.000883 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002089/0.068298/0.000892 !! - method: normal !! Begin decoder 26 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1026.000000, max: 98.375000, std: 2.222656 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.038818, max: 0.875000, std: 0.026947 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002312/0.030716/0.001928 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002153/0.033234/0.002005 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004166/0.014450/0.000995 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002365/0.091187/0.001030 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1026.000000, max: 103.250000, std: 2.253906 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.030518, max: 0.400391, std: 0.021332 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004192/0.032410/0.001042 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004314/0.036591/0.000883 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002001/0.074585/0.000899 !! - method: normal !! Begin decoder 27 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1025.000000, max: 106.812500, std: 2.332031 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.044922, max: 0.906250, std: 0.027390 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002163/0.037323/0.002039 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002100/0.032104/0.002142 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004280/0.019775/0.000985 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002172/0.070496/0.001004 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1025.000000, max: 113.375000, std: 2.388672 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.034180, max: 0.406250, std: 0.021439 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004284/0.040131/0.001047 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004375/0.046295/0.000883 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002033/0.049622/0.000891 !! - method: normal !! Begin decoder 28 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1024.000000, max: 116.187500, std: 2.458984 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.038818, max: 0.937500, std: 0.027420 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002270/0.045990/0.002008 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002068/0.035706/0.002039 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003502/0.013725/0.001108 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002235/0.154175/0.001218 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1024.000000, max: 128.750000, std: 2.568359 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.022705, max: 0.423828, std: 0.022003 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004471/0.042694/0.001054 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004562/0.022446/0.000878 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001733/0.056427/0.000884 !! - method: normal !! Begin decoder 29 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1023.000000, max: 131.500000, std: 2.623047 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.003403, max: 0.957031, std: 0.027893 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002245/0.032928/0.001910 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002039/0.030350/0.001957 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004120/0.014153/0.001067 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002306/0.074097/0.001082 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1023.000000, max: 135.375000, std: 2.656250 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.028442, max: 0.691406, std: 0.022568 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004440/0.035675/0.001063 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004391/0.031128/0.000879 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001850/0.075684/0.000896 !! - method: normal !! Begin decoder 30 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1022.000000, max: 138.750000, std: 2.707031 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.039062, max: 0.953125, std: 0.028458 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002247/0.030197/0.001984 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002272/0.032532/0.002090 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002539/0.015915/0.001025 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002310/0.092224/0.001046 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1022.000000, max: 145.125000, std: 2.757812 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.013855, max: 0.443359, std: 0.021713 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004665/0.045197/0.001092 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004078/0.036926/0.000885 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001813/0.072693/0.000899 !! - method: normal !! Begin decoder 31 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1020.500000, max: 151.500000, std: 2.837891 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.005127, max: 0.949219, std: 0.028824 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002350/0.031052/0.001871 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002193/0.030899/0.001905 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004337/0.015503/0.001026 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002642/0.092957/0.001069 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1020.500000, max: 163.125000, std: 2.910156 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.015198, max: 0.449219, std: 0.022018 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004639/0.031525/0.001118 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004658/0.035858/0.000885 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001880/0.045258/0.000892 !! - method: normal !! Begin decoder 32 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1019.000000, max: 165.250000, std: 2.960938 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.002731, max: 0.898438, std: 0.028946 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002439/0.031342/0.001923 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001745/0.039093/0.001959 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003937/0.014107/0.001027 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002518/0.113953/0.001073 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1019.000000, max: 170.125000, std: 3.003906 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.003967, max: 0.746094, std: 0.022736 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004223/0.046234/0.001122 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004738/0.031342/0.000886 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001719/0.055420/0.000911 !! - method: normal !! Begin decoder 33 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1016.500000, max: 172.750000, std: 3.056641 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.033203, max: 0.910156, std: 0.029999 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002224/0.034576/0.001955 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002178/0.034698/0.001965 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003191/0.017090/0.001073 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002516/0.098511/0.001093 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1015.500000, max: 177.375000, std: 3.095703 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.021729, max: 0.457031, std: 0.021973 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004440/0.058960/0.001143 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004120/0.027802/0.000899 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001822/0.089966/0.000950 !! - method: normal !! Begin decoder 34 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1012.000000, max: 178.875000, std: 3.134766 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.038086, max: 0.953125, std: 0.030441 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002279/0.033783/0.001966 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002062/0.031311/0.002022 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003651/0.016846/0.001222 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002913/0.079651/0.001315 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -1011.000000, max: 181.750000, std: 3.199219 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.029907, max: 0.460938, std: 0.021744 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004433/0.036102/0.001138 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004498/0.028717/0.000901 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001920/0.123169/0.001141 !! - method: normal !! Begin decoder 35 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -982.000000, max: 186.500000, std: 3.277344 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.040283, max: 0.917969, std: 0.029037 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002428/0.032837/0.001951 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002157/0.030807/0.002024 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003971/0.013626/0.001038 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003014/0.090149/0.001112 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -981.000000, max: 191.500000, std: 3.328125 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.030151, max: 0.468750, std: 0.021896 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002829/0.039459/0.001129 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004147/0.044250/0.000917 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002134/0.148560/0.001385 !! - method: normal !! Begin decoder 36 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -919.500000, max: 191.500000, std: 3.392578 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.004456, max: 0.941406, std: 0.031082 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001844/0.032776/0.001974 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001781/0.031769/0.002085 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004047/0.016876/0.001062 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002771/0.059174/0.001117 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -919.000000, max: 193.875000, std: 3.429688 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.050537, max: 0.839844, std: 0.022324 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004131/0.048218/0.001153 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004238/0.036469/0.000927 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002237/0.148193/0.001454 !! - method: normal !! Begin decoder 37 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -861.000000, max: 191.125000, std: 3.441406 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: -0.002762, max: 1.054688, std: 0.032867 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002245/0.036652/0.001965 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.001849/0.033752/0.002066 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003832/0.017563/0.001212 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003330/0.115906/0.001400 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -857.500000, max: 195.500000, std: 3.544922 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.066406, max: 0.593750, std: 0.021439 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003469/0.083496/0.001222 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003468/0.034821/0.000952 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002447/0.204346/0.002012 !! - method: normal !! Begin decoder 38 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -580.000000, max: 195.125000, std: 3.591797 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.097656, max: 1.039062, std: 0.031891 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002375/0.045197/0.001980 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002171/0.030624/0.001997 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003450/0.017731/0.001331 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003344/0.227539/0.001991 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -108.500000, max: 203.750000, std: 3.845703 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.087891, max: 0.498047, std: 0.020370 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004387/0.031525/0.001246 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002453/0.059601/0.001083 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003397/0.199585/0.001426 !! - method: normal !! Begin decoder 39 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -168.000000, max: 226.125000, std: 4.089844 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.002625, max: 0.957031, std: 0.032471 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002300/0.047607/0.002197 !! - self_attn.k_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002066/0.033020/0.002274 !! - self_attn.v_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002975/0.016586/0.001257 !! - self_attn.o_proj: cuda:0 [Q,x_map] scales min/max/std: 0.003019/0.146851/0.001698 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -144.375000, max: 229.500000, std: 4.367188 !! - layernorm.weight: device: cuda:0, shape: [5120], dtype: float16, min: 0.109863, max: 0.648438, std: 0.025543 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002789/0.032501/0.001303 !! - mlp.up_proj: cuda:0 [Q,x_map] scales min/max/std: 0.002787/0.085999/0.001245 !! - mlp.down_proj: cuda:0 [Q,x_map] scales min/max/std: 0.004478/0.175049/0.001831 !! - method: normal !! pre norm, hidden_states: device: cuda:0, shape: [1, 1920, 5120], dtype: float16, min: -198.250000, max: 719.000000, std: 6.828125 !! pre lm_head, hidden_states: device: cuda:0, shape: [1, 1, 5120], dtype: float16, min: -13.359375, max: 17.625000, std: 1.145508 !! logits: device: cuda:0, shape: [1, 1, 32000], dtype: float16, min: -11.101562, max: 10.367188, std: 2.171875 !! Moving logits from cuda:0 to cpu ** Time, Inference: 0.93 seconds