diff --git "a/1x_A10G_24GB/nohup.out" "b/1x_A10G_24GB/nohup.out" new file mode 100644--- /dev/null +++ "b/1x_A10G_24GB/nohup.out" @@ -0,0 +1,5299 @@ +Multi-GPU support is disabled. Using a single GPU. ++-----------------------+----------------------------------------------------+ +| Parameter | Value | ++-----------------------+----------------------------------------------------+ +| train data pattern | dev/data/fineweb10B/fineweb_train_*.bin | +| val data pattern | dev/data/fineweb10B/fineweb_val_*.bin | +| output log dir | log124M | +| checkpoint_every | 5000 | +| resume | 0 | +| micro batch size B | 32 | +| sequence length T | 1024 | +| total batch size | 524288 | +| LR scheduler | cosine | +| learning rate (LR) | 6.000000e-04 | +| warmup iterations | 700 | +| final LR fraction | 0.000000e+00 | +| weight decay | 1.000000e-01 | +| skip update lossz | 0.000000 | +| skip update gradz | 0.000000 | +| max_steps | -1 | +| val_loss_every | 250 | +| val_max_steps | 20 | +| sample_every | 20000 | +| genT | 64 | +| overfit_single_batch | 0 | +| use_master_weights | enabled | +| gelu_fusion | 0 | +| recompute | 1 | ++-----------------------+----------------------------------------------------+ +| device | NVIDIA A10G | +| peak TFlops | -1.0 | +| precision | BF16 | ++-----------------------+----------------------------------------------------+ +| weight init method | d12 | +| max_sequence_length T | 1024 | +| vocab_size V | 50257 | +| padded_vocab_size Vp | 50304 | +| num_layers L | 12 | +| num_heads NH | 12 | +| channels C | 768 | +| num_parameters | 124475904 | ++-----------------------+----------------------------------------------------+ +| train_num_batches | 19560 | +| val_num_batches | 20 | ++-----------------------+----------------------------------------------------+ +| run hellaswag | yes | ++-----------------------+----------------------------------------------------+ +| num_processes | 1 | +| zero_stage | 1 | ++-----------------------+----------------------------------------------------+ +num_parameters: 124475904 => bytes: 248951808 +allocated 237 MiB for model parameters +batch_size B=32 * seq_len T=1024 * num_processes=1 and total_batch_size=524288 +=> setting grad_accum_steps=16 +--- +WARNING: Failed to open the tokenizer file gpt2_tokenizer.bin +The Tokenizer is a new feature added April 14 2024. +Re-run `python train_gpt2.py` to write it +--- +allocating 237 MiB for parameter gradients +allocating 10608 MiB for activations +allocating 474 MiB for AdamW optimizer state m +allocating 474 MiB for AdamW optimizer state v +allocating 474 MiB for master copy of params +device memory usage: 12923 MiB / 22502 MiB +memory per sequence: 331 MiB + -> estimated maximum batch size: 60 +val loss 11.008753 +step 1/19560 | loss 11.010259 (+nanz)| norm 15.1382 (+nanz)| lr 8.57e-07 | 8793.89 ms | -100.0% bf16 MFU | 59620 tok/s +step 2/19560 | loss 10.956964 (+nanz)| norm 15.2993 (+nanz)| lr 1.71e-06 | 8424.12 ms | -100.0% bf16 MFU | 62237 tok/s +step 3/19560 | loss 10.848864 (+nanz)| norm 14.8130 (+nanz)| lr 2.57e-06 | 8425.21 ms | -100.0% bf16 MFU | 62232 tok/s +step 4/19560 | loss 10.718041 (+nanz)| norm 12.9247 (+nanz)| lr 3.43e-06 | 8439.93 ms | -100.0% bf16 MFU | 62193 tok/s +step 5/19560 | loss 10.560916 (+nanz)| norm 10.5543 (+nanz)| lr 4.29e-06 | 8454.25 ms | -100.0% bf16 MFU | 62145 tok/s +step 6/19560 | loss 10.416245 (+nanz)| norm 8.5709 (+nanz)| lr 5.14e-06 | 8451.84 ms | -100.0% bf16 MFU | 62120 tok/s +step 7/19560 | loss 10.299311 (+nanz)| norm 7.1331 (+nanz)| lr 6.00e-06 | 8453.42 ms | -100.0% bf16 MFU | 62101 tok/s +step 8/19560 | loss 10.185864 (+nanz)| norm 6.2568 (+nanz)| lr 6.86e-06 | 8447.38 ms | -100.0% bf16 MFU | 62095 tok/s +step 9/19560 | loss 10.062736 (+nanz)| norm 5.4449 (+nanz)| lr 7.71e-06 | 8449.10 ms | -100.0% bf16 MFU | 62089 tok/s +step 10/19560 | loss 10.002638 (+nanz)| norm 4.5096 (+nanz)| lr 8.57e-06 | 8449.80 ms | -100.0% bf16 MFU | 62083 tok/s +step 11/19560 | loss 9.922058 (+nanz)| norm 3.8770 (+nanz)| lr 9.43e-06 | 8451.08 ms | -100.0% bf16 MFU | 62078 tok/s +step 12/19560 | loss 9.843082 (+nanz)| norm 3.4079 (+nanz)| lr 1.03e-05 | 8448.84 ms | -100.0% bf16 MFU | 62075 tok/s +step 13/19560 | loss 9.813807 (+nanz)| norm 2.9508 (+nanz)| lr 1.11e-05 | 8450.27 ms | -100.0% bf16 MFU | 62072 tok/s +step 14/19560 | loss 9.751107 (+nanz)| norm 2.7054 (+nanz)| lr 1.20e-05 | 8452.14 ms | -100.0% bf16 MFU | 62067 tok/s +step 15/19560 | loss 9.709040 (+nanz)| norm 2.4936 (+nanz)| lr 1.29e-05 | 8444.66 ms | -100.0% bf16 MFU | 62069 tok/s +step 16/19560 | loss 9.699852 (+nanz)| norm 2.2954 (+nanz)| lr 1.37e-05 | 8446.41 ms | -100.0% bf16 MFU | 62069 tok/s +step 17/19560 | loss 9.646862 (+nanz)| norm 2.2614 (+nanz)| lr 1.46e-05 | 8448.62 ms | -100.0% bf16 MFU | 62068 tok/s +step 18/19560 | loss 9.651264 (+nanz)| norm 2.1821 (+nanz)| lr 1.54e-05 | 8445.83 ms | -100.0% bf16 MFU | 62069 tok/s +step 19/19560 | loss 9.606375 (+nanz)| norm 2.1965 (+nanz)| lr 1.63e-05 | 8448.44 ms | -100.0% bf16 MFU | 62068 tok/s +step 20/19560 | loss 9.587271 (+nanz)| norm 2.1762 (+nanz)| lr 1.71e-05 | 8443.36 ms | -100.0% bf16 MFU | 62070 tok/s +step 21/19560 | loss 9.574852 (+nanz)| norm 2.1452 (+nanz)| lr 1.80e-05 | 8434.44 ms | -100.0% bf16 MFU | 62077 tok/s +step 22/19560 | loss 9.529562 (+nanz)| norm 2.1578 (+nanz)| lr 1.89e-05 | 8446.56 ms | -100.0% bf16 MFU | 62077 tok/s +step 23/19560 | loss 9.511011 (+nanz)| norm 2.1096 (+nanz)| lr 1.97e-05 | 8442.77 ms | -100.0% bf16 MFU | 62078 tok/s +step 24/19560 | loss 9.470493 (+nanz)| norm 2.0948 (+nanz)| lr 2.06e-05 | 8434.33 ms | -100.0% bf16 MFU | 62084 tok/s +step 25/19560 | loss 9.478806 (+nanz)| norm 1.9904 (+nanz)| lr 2.14e-05 | 8438.12 ms | -100.0% bf16 MFU | 62088 tok/s +step 26/19560 | loss 9.438299 (+nanz)| norm 2.0031 (+nanz)| lr 2.23e-05 | 8429.98 ms | -100.0% bf16 MFU | 62095 tok/s +step 27/19560 | loss 9.404727 (+nanz)| norm 1.9770 (+nanz)| lr 2.31e-05 | 8441.09 ms | -100.0% bf16 MFU | 62096 tok/s +step 28/19560 | loss 9.371781 (+nanz)| norm 2.0175 (+nanz)| lr 2.40e-05 | 8442.62 ms | -100.0% bf16 MFU | 62096 tok/s +step 29/19560 | loss 9.334040 (+nanz)| norm 1.9800 (+nanz)| lr 2.49e-05 | 8437.23 ms | -100.0% bf16 MFU | 62099 tok/s +step 30/19560 | loss 9.274792 (+nanz)| norm 1.9668 (+nanz)| lr 2.57e-05 | 8446.23 ms | -100.0% bf16 MFU | 62098 tok/s +step 31/19560 | loss 9.276773 (+nanz)| norm 1.9861 (+nanz)| lr 2.66e-05 | 8440.83 ms | -100.0% bf16 MFU | 62099 tok/s +step 32/19560 | loss 9.189733 (+nanz)| norm 1.9960 (+nanz)| lr 2.74e-05 | 8438.08 ms | -100.0% bf16 MFU | 62101 tok/s +step 33/19560 | loss 9.177148 (+nanz)| norm 2.0200 (+nanz)| lr 2.83e-05 | 8439.18 ms | -100.0% bf16 MFU | 62102 tok/s +step 34/19560 | loss 9.166351 (+nanz)| norm 1.8684 (+nanz)| lr 2.91e-05 | 8440.36 ms | -100.0% bf16 MFU | 62103 tok/s +step 35/19560 | loss 9.104283 (+nanz)| norm 1.8384 (+nanz)| lr 3.00e-05 | 8444.23 ms | -100.0% bf16 MFU | 62102 tok/s +step 36/19560 | loss 9.058926 (+nanz)| norm 1.9402 (+nanz)| lr 3.09e-05 | 8441.54 ms | -100.0% bf16 MFU | 62103 tok/s +step 37/19560 | loss 9.031883 (+nanz)| norm 1.8592 (+nanz)| lr 3.17e-05 | 8444.98 ms | -100.0% bf16 MFU | 62102 tok/s +step 38/19560 | loss 8.993668 (+nanz)| norm 1.7465 (+nanz)| lr 3.26e-05 | 8440.26 ms | -100.0% bf16 MFU | 62102 tok/s +step 39/19560 | loss 8.992878 (+nanz)| norm 1.7351 (+nanz)| lr 3.34e-05 | 8445.54 ms | -100.0% bf16 MFU | 62101 tok/s +step 40/19560 | loss 8.945091 (+nanz)| norm 1.9640 (+nanz)| lr 3.43e-05 | 8436.78 ms | -100.0% bf16 MFU | 62103 tok/s +step 41/19560 | loss 8.924958 (+nanz)| norm 1.8914 (+nanz)| lr 3.51e-05 | 8436.80 ms | -100.0% bf16 MFU | 62106 tok/s +step 42/19560 | loss 8.840565 (+nanz)| norm 1.6533 (+nanz)| lr 3.60e-05 | 8437.56 ms | -100.0% bf16 MFU | 62108 tok/s +step 43/19560 | loss 8.827600 (+nanz)| norm 1.8524 (+nanz)| lr 3.69e-05 | 8443.38 ms | -100.0% bf16 MFU | 62107 tok/s +step 44/19560 | loss 8.798441 (+nanz)| norm 2.0519 (+nanz)| lr 3.77e-05 | 8440.43 ms | -100.0% bf16 MFU | 62107 tok/s +step 45/19560 | loss 8.754171 (+nanz)| norm 1.7828 (+nanz)| lr 3.86e-05 | 8435.29 ms | -100.0% bf16 MFU | 62110 tok/s +step 46/19560 | loss 8.743413 (+nanz)| norm 1.5417 (+nanz)| lr 3.94e-05 | 8439.77 ms | -100.0% bf16 MFU | 62111 tok/s +step 47/19560 | loss 8.667406 (+nanz)| norm 1.7163 (+nanz)| lr 4.03e-05 | 8440.67 ms | -100.0% bf16 MFU | 62111 tok/s +step 48/19560 | loss 8.624380 (+nanz)| norm 1.8641 (+nanz)| lr 4.11e-05 | 8447.53 ms | -100.0% bf16 MFU | 62108 tok/s +step 49/19560 | loss 8.603354 (+nanz)| norm 1.6230 (+nanz)| lr 4.20e-05 | 8443.17 ms | -100.0% bf16 MFU | 62108 tok/s +step 50/19560 | loss 8.598346 (+nanz)| norm 1.5292 (+nanz)| lr 4.29e-05 | 8437.36 ms | -100.0% bf16 MFU | 62109 tok/s +step 51/19560 | loss 8.553126 (+nanz)| norm 1.5569 (+nanz)| lr 4.37e-05 | 8437.45 ms | -100.0% bf16 MFU | 62111 tok/s +step 52/19560 | loss 8.453709 (+nanz)| norm 1.6110 (+nanz)| lr 4.46e-05 | 8438.79 ms | -100.0% bf16 MFU | 62112 tok/s +step 53/19560 | loss 8.464251 (+nanz)| norm 1.6304 (+nanz)| lr 4.54e-05 | 8440.76 ms | -100.0% bf16 MFU | 62112 tok/s +step 54/19560 | loss 8.432004 (+nanz)| norm 1.5422 (+nanz)| lr 4.63e-05 | 8444.15 ms | -100.0% bf16 MFU | 62111 tok/s +step 55/19560 | loss 8.428511 (+nanz)| norm 1.4384 (+nanz)| lr 4.71e-05 | 8436.24 ms | -100.0% bf16 MFU | 62113 tok/s +step 56/19560 | loss 8.354360 (+nanz)| norm 1.6018 (+nanz)| lr 4.80e-05 | 8436.83 ms | -100.0% bf16 MFU | 62114 tok/s +step 57/19560 | loss 8.294889 (+nanz)| norm 1.6498 (+nanz)| lr 4.89e-05 | 8438.10 ms | -100.0% bf16 MFU | 62115 tok/s +step 58/19560 | loss 8.284508 (+nanz)| norm 1.5375 (+nanz)| lr 4.97e-05 | 8442.39 ms | -100.0% bf16 MFU | 62115 tok/s +step 59/19560 | loss 8.217104 (+nanz)| norm 1.5933 (+nanz)| lr 5.06e-05 | 8434.60 ms | -100.0% bf16 MFU | 62117 tok/s +step 60/19560 | loss 8.236506 (+nanz)| norm 1.3676 (+nanz)| lr 5.14e-05 | 8438.57 ms | -100.0% bf16 MFU | 62118 tok/s +step 61/19560 | loss 8.180492 (+nanz)| norm 1.5023 (+nanz)| lr 5.23e-05 | 8431.72 ms | -100.0% bf16 MFU | 62121 tok/s +step 62/19560 | loss 8.110718 (+nanz)| norm 1.5773 (+nanz)| lr 5.31e-05 | 8442.67 ms | -100.0% bf16 MFU | 62120 tok/s +step 63/19560 | loss 8.096655 (+nanz)| norm 1.5944 (+nanz)| lr 5.40e-05 | 8433.66 ms | -100.0% bf16 MFU | 62122 tok/s +step 64/19560 | loss 8.009394 (+nanz)| norm 1.3932 (+nanz)| lr 5.49e-05 | 8439.39 ms | -100.0% bf16 MFU | 62122 tok/s +step 65/19560 | loss 8.006959 (+nanz)| norm 1.3041 (+nanz)| lr 5.57e-05 | 8432.05 ms | -100.0% bf16 MFU | 62125 tok/s +step 66/19560 | loss 8.009072 (+nanz)| norm 1.4577 (+nanz)| lr 5.66e-05 | 8442.31 ms | -100.0% bf16 MFU | 62124 tok/s +step 67/19560 | loss 7.949001 (+nanz)| norm 1.3010 (+nanz)| lr 5.74e-05 | 8436.97 ms | -100.0% bf16 MFU | 62125 tok/s +step 68/19560 | loss 7.928912 (+nanz)| norm 1.2694 (+nanz)| lr 5.83e-05 | 8433.81 ms | -100.0% bf16 MFU | 62127 tok/s +step 69/19560 | loss 7.881010 (+nanz)| norm 1.3286 (+nanz)| lr 5.91e-05 | 8440.56 ms | -100.0% bf16 MFU | 62126 tok/s +step 70/19560 | loss 7.856122 (+nanz)| norm 1.1869 (+nanz)| lr 6.00e-05 | 8432.21 ms | -100.0% bf16 MFU | 62129 tok/s +step 71/19560 | loss 7.787865 (+nanz)| norm 1.3163 (+nanz)| lr 6.09e-05 | 8434.93 ms | -100.0% bf16 MFU | 62130 tok/s +step 72/19560 | loss 7.777581 (+nanz)| norm 1.2215 (+nanz)| lr 6.17e-05 | 8437.67 ms | -100.0% bf16 MFU | 62131 tok/s +step 73/19560 | loss 7.800623 (+nanz)| norm 1.2251 (+nanz)| lr 6.26e-05 | 8432.98 ms | -100.0% bf16 MFU | 62133 tok/s +step 74/19560 | loss 7.727236 (+nanz)| norm 1.5249 (+nanz)| lr 6.34e-05 | 8437.46 ms | -100.0% bf16 MFU | 62133 tok/s +step 75/19560 | loss 7.743951 (+nanz)| norm 1.0699 (+nanz)| lr 6.43e-05 | 8439.23 ms | -100.0% bf16 MFU | 62133 tok/s +step 76/19560 | loss 7.656986 (+nanz)| norm 1.3256 (+nanz)| lr 6.51e-05 | 8437.42 ms | -100.0% bf16 MFU | 62133 tok/s +step 77/19560 | loss 7.626959 (+nanz)| norm 1.3845 (+nanz)| lr 6.60e-05 | 8436.04 ms | -100.0% bf16 MFU | 62134 tok/s +step 78/19560 | loss 7.596109 (+nanz)| norm 1.0311 (+nanz)| lr 6.69e-05 | 8432.50 ms | -100.0% bf16 MFU | 62136 tok/s +step 79/19560 | loss 7.550291 (+nanz)| norm 1.0829 (+nanz)| lr 6.77e-05 | 8435.31 ms | -100.0% bf16 MFU | 62137 tok/s +step 80/19560 | loss 7.509967 (+nanz)| norm 1.0553 (+nanz)| lr 6.86e-05 | 8431.60 ms | -100.0% bf16 MFU | 62139 tok/s +step 81/19560 | loss 7.510971 (+nanz)| norm 1.5032 (+nanz)| lr 6.94e-05 | 8431.95 ms | -100.0% bf16 MFU | 62141 tok/s +step 82/19560 | loss 7.470774 (+nanz)| norm 1.0560 (+nanz)| lr 7.03e-05 | 8433.16 ms | -100.0% bf16 MFU | 62143 tok/s +step 83/19560 | loss 7.485591 (+nanz)| norm 0.9092 (+nanz)| lr 7.11e-05 | 8437.94 ms | -100.0% bf16 MFU | 62142 tok/s +step 84/19560 | loss 7.499313 (+nanz)| norm 1.0952 (+nanz)| lr 7.20e-05 | 8436.12 ms | -100.0% bf16 MFU | 62142 tok/s +step 85/19560 | loss 7.397354 (+nanz)| norm 0.9090 (+nanz)| lr 7.29e-05 | 8434.60 ms | -100.0% bf16 MFU | 62143 tok/s +step 86/19560 | loss 7.384872 (+nanz)| norm 0.8060 (+nanz)| lr 7.37e-05 | 8429.24 ms | -100.0% bf16 MFU | 62146 tok/s +step 87/19560 | loss 7.331864 (+nanz)| norm 1.2114 (+nanz)| lr 7.46e-05 | 8440.42 ms | -100.0% bf16 MFU | 62145 tok/s +step 88/19560 | loss 7.361476 (+nanz)| norm 1.0832 (+nanz)| lr 7.54e-05 | 8435.40 ms | -100.0% bf16 MFU | 62145 tok/s +step 89/19560 | loss 7.377361 (+nanz)| norm 1.1907 (+nanz)| lr 7.63e-05 | 8435.73 ms | -100.0% bf16 MFU | 62145 tok/s +step 90/19560 | loss 7.320753 (+nanz)| norm 0.8184 (+nanz)| lr 7.71e-05 | 8432.62 ms | -100.0% bf16 MFU | 62147 tok/s +step 91/19560 | loss 7.250688 (+nanz)| norm 1.1590 (+nanz)| lr 7.80e-05 | 8426.20 ms | -100.0% bf16 MFU | 62150 tok/s +step 92/19560 | loss 7.255867 (+nanz)| norm 0.9096 (+nanz)| lr 7.89e-05 | 8436.35 ms | -100.0% bf16 MFU | 62150 tok/s +step 93/19560 | loss 7.257424 (+nanz)| norm 0.9557 (+nanz)| lr 7.97e-05 | 8433.86 ms | -100.0% bf16 MFU | 62151 tok/s +step 94/19560 | loss 7.269912 (+nanz)| norm 0.7752 (+nanz)| lr 8.06e-05 | 8435.33 ms | -100.0% bf16 MFU | 62151 tok/s +step 95/19560 | loss 7.200772 (+nanz)| norm 0.7679 (+nanz)| lr 8.14e-05 | 8438.23 ms | -100.0% bf16 MFU | 62150 tok/s +step 96/19560 | loss 7.202884 (+nanz)| norm 1.1951 (+nanz)| lr 8.23e-05 | 8433.12 ms | -100.0% bf16 MFU | 62151 tok/s +step 97/19560 | loss 7.146716 (+nanz)| norm 1.8277 (+nanz)| lr 8.31e-05 | 8436.53 ms | -100.0% bf16 MFU | 62151 tok/s +step 98/19560 | loss 7.166766 (+nanz)| norm 0.6801 (+nanz)| lr 8.40e-05 | 8432.35 ms | -100.0% bf16 MFU | 62152 tok/s +step 99/19560 | loss 7.152293 (+nanz)| norm 0.8604 (+nanz)| lr 8.49e-05 | 8429.16 ms | -100.0% bf16 MFU | 62155 tok/s +step 100/19560 | loss 7.109857 (+nanz)| norm 0.7855 (+nanz)| lr 8.57e-05 | 8434.53 ms | -100.0% bf16 MFU | 62155 tok/s +step 101/19560 | loss 7.132414 (+nanz)| norm 0.8171 (+nanz)| lr 8.66e-05 | 8434.52 ms | -100.0% bf16 MFU | 62155 tok/s +step 102/19560 | loss 7.097123 (+nanz)| norm 0.7269 (+nanz)| lr 8.74e-05 | 8433.42 ms | -100.0% bf16 MFU | 62156 tok/s +step 103/19560 | loss 7.007131 (+nanz)| norm 1.2193 (+nanz)| lr 8.83e-05 | 8433.54 ms | -100.0% bf16 MFU | 62156 tok/s +step 104/19560 | loss 7.082747 (+nanz)| norm 1.6976 (+nanz)| lr 8.91e-05 | 8433.07 ms | -100.0% bf16 MFU | 62157 tok/s +step 105/19560 | loss 7.042377 (+nanz)| norm 0.7539 (+nanz)| lr 9.00e-05 | 8436.22 ms | -100.0% bf16 MFU | 62156 tok/s +step 106/19560 | loss 7.039721 (+nanz)| norm 2.1856 (+nanz)| lr 9.09e-05 | 8437.80 ms | -100.0% bf16 MFU | 62155 tok/s +step 107/19560 | loss 7.047487 (+nanz)| norm 0.9788 (+nanz)| lr 9.17e-05 | 8438.45 ms | -100.0% bf16 MFU | 62154 tok/s +step 108/19560 | loss 7.015858 (+nanz)| norm 0.9960 (+nanz)| lr 9.26e-05 | 8437.92 ms | -100.0% bf16 MFU | 62153 tok/s +step 109/19560 | loss 6.972701 (+nanz)| norm 0.9726 (+nanz)| lr 9.34e-05 | 8439.13 ms | -100.0% bf16 MFU | 62152 tok/s +step 110/19560 | loss 7.038019 (+nanz)| norm 0.9371 (+nanz)| lr 9.43e-05 | 8430.27 ms | -100.0% bf16 MFU | 62154 tok/s +step 111/19560 | loss 7.002180 (+nanz)| norm 1.1173 (+nanz)| lr 9.51e-05 | 8436.07 ms | -100.0% bf16 MFU | 62154 tok/s +step 112/19560 | loss 6.995847 (+nanz)| norm 0.8903 (+nanz)| lr 9.60e-05 | 8435.02 ms | -100.0% bf16 MFU | 62154 tok/s +step 113/19560 | loss 6.982081 (+nanz)| norm 0.7107 (+nanz)| lr 9.69e-05 | 8437.36 ms | -100.0% bf16 MFU | 62153 tok/s +step 114/19560 | loss 6.872504 (+nanz)| norm 0.7813 (+nanz)| lr 9.77e-05 | 8433.50 ms | -100.0% bf16 MFU | 62154 tok/s +step 115/19560 | loss 6.887468 (+nanz)| norm 0.8278 (+nanz)| lr 9.86e-05 | 8431.48 ms | -100.0% bf16 MFU | 62155 tok/s +step 116/19560 | loss 6.910745 (+nanz)| norm 0.6931 (+nanz)| lr 9.94e-05 | 8433.80 ms | -100.0% bf16 MFU | 62156 tok/s +step 117/19560 | loss 6.874321 (+nanz)| norm 0.6662 (+nanz)| lr 1.00e-04 | 8436.75 ms | -100.0% bf16 MFU | 62155 tok/s +step 118/19560 | loss 6.911847 (+nanz)| norm 0.6455 (+nanz)| lr 1.01e-04 | 8435.74 ms | -100.0% bf16 MFU | 62155 tok/s +step 119/19560 | loss 6.868719 (+nanz)| norm 0.8190 (+nanz)| lr 1.02e-04 | 8435.23 ms | -100.0% bf16 MFU | 62155 tok/s +step 120/19560 | loss 6.835175 (+nanz)| norm 1.0509 (+nanz)| lr 1.03e-04 | 8433.48 ms | -100.0% bf16 MFU | 62155 tok/s +step 121/19560 | loss 6.842366 (+nanz)| norm 1.2438 (+nanz)| lr 1.04e-04 | 8434.77 ms | -100.0% bf16 MFU | 62156 tok/s +step 122/19560 | loss 6.857784 (+nanz)| norm 0.7474 (+nanz)| lr 1.05e-04 | 8431.36 ms | -100.0% bf16 MFU | 62157 tok/s +step 123/19560 | loss 6.859900 (+nanz)| norm 0.8385 (+nanz)| lr 1.05e-04 | 8435.60 ms | -100.0% bf16 MFU | 62157 tok/s +step 124/19560 | loss 6.835468 (+nanz)| norm 0.8834 (+nanz)| lr 1.06e-04 | 8433.41 ms | -100.0% bf16 MFU | 62157 tok/s +step 125/19560 | loss 6.846536 (+nanz)| norm 1.2363 (+nanz)| lr 1.07e-04 | 8435.58 ms | -100.0% bf16 MFU | 62157 tok/s +step 126/19560 | loss 6.860096 (+nanz)| norm 0.9752 (+nanz)| lr 1.08e-04 | 8438.81 ms | -100.0% bf16 MFU | 62155 tok/s +step 127/19560 | loss 6.815416 (+nanz)| norm 1.2405 (+nanz)| lr 1.09e-04 | 8437.01 ms | -100.0% bf16 MFU | 62155 tok/s +step 128/19560 | loss 6.863970 (+nanz)| norm 0.9818 (+nanz)| lr 1.10e-04 | 8437.30 ms | -100.0% bf16 MFU | 62154 tok/s +step 129/19560 | loss 6.805686 (-1.25z)| norm 1.0218 (-0.43z)| lr 1.11e-04 | 8438.54 ms | -100.0% bf16 MFU | 62153 tok/s +step 130/19560 | loss 6.764585 (-1.28z)| norm 0.6592 (-0.61z)| lr 1.11e-04 | 8437.21 ms | -100.0% bf16 MFU | 62152 tok/s +step 131/19560 | loss 6.739507 (-1.30z)| norm 0.8550 (-0.56z)| lr 1.12e-04 | 8437.58 ms | -100.0% bf16 MFU | 62151 tok/s +step 132/19560 | loss 6.774843 (-1.26z)| norm 0.6494 (-0.76z)| lr 1.13e-04 | 8434.86 ms | -100.0% bf16 MFU | 62152 tok/s +step 133/19560 | loss 6.804895 (-1.22z)| norm 0.6145 (-0.89z)| lr 1.14e-04 | 8433.95 ms | -100.0% bf16 MFU | 62152 tok/s +step 134/19560 | loss 6.741206 (-1.27z)| norm 0.6537 (-0.94z)| lr 1.15e-04 | 8433.35 ms | -100.0% bf16 MFU | 62153 tok/s +step 135/19560 | loss 6.710127 (-1.29z)| norm 0.6882 (-0.99z)| lr 1.16e-04 | 8433.98 ms | -100.0% bf16 MFU | 62154 tok/s +step 136/19560 | loss 6.660229 (-1.32z)| norm 0.7663 (-0.97z)| lr 1.17e-04 | 8432.64 ms | -100.0% bf16 MFU | 62155 tok/s +step 137/19560 | loss 6.694194 (-1.28z)| norm 0.9231 (-0.80z)| lr 1.17e-04 | 8437.79 ms | -100.0% bf16 MFU | 62154 tok/s +step 138/19560 | loss 6.673580 (-1.28z)| norm 1.0368 (-0.64z)| lr 1.18e-04 | 8439.36 ms | -100.0% bf16 MFU | 62152 tok/s +step 139/19560 | loss 6.595860 (-1.34z)| norm 1.1593 (-0.43z)| lr 1.19e-04 | 8438.67 ms | -100.0% bf16 MFU | 62151 tok/s +step 140/19560 | loss 6.619841 (-1.31z)| norm 1.1321 (-0.47z)| lr 1.20e-04 | 8434.35 ms | -100.0% bf16 MFU | 62152 tok/s +step 141/19560 | loss 6.613657 (-1.30z)| norm 1.1862 (-0.35z)| lr 1.21e-04 | 8434.10 ms | -100.0% bf16 MFU | 62152 tok/s +step 142/19560 | loss 6.633605 (-1.26z)| norm 1.3164 (-0.07z)| lr 1.22e-04 | 8433.49 ms | -100.0% bf16 MFU | 62153 tok/s +step 143/19560 | loss 6.660674 (-1.22z)| norm 0.8091 (-1.12z)| lr 1.23e-04 | 8434.33 ms | -100.0% bf16 MFU | 62153 tok/s +step 144/19560 | loss 6.568370 (-1.30z)| norm 0.6045 (-1.53z)| lr 1.23e-04 | 8436.17 ms | -100.0% bf16 MFU | 62153 tok/s +step 145/19560 | loss 6.592902 (-1.26z)| norm 0.6653 (-1.39z)| lr 1.24e-04 | 8432.18 ms | -100.0% bf16 MFU | 62154 tok/s +step 146/19560 | loss 6.594756 (-1.25z)| norm 0.6555 (-1.39z)| lr 1.25e-04 | 8436.07 ms | -100.0% bf16 MFU | 62154 tok/s +step 147/19560 | loss 6.586793 (-1.24z)| norm 0.6856 (-1.31z)| lr 1.26e-04 | 8433.56 ms | -100.0% bf16 MFU | 62155 tok/s +step 148/19560 | loss 6.622343 (-1.19z)| norm 0.8251 (-1.00z)| lr 1.27e-04 | 8433.28 ms | -100.0% bf16 MFU | 62155 tok/s +step 149/19560 | loss 6.604802 (-1.20z)| norm 1.1060 (-0.37z)| lr 1.28e-04 | 8433.31 ms | -100.0% bf16 MFU | 62156 tok/s +step 150/19560 | loss 6.602734 (-1.19z)| norm 1.5171 (+0.57z)| lr 1.29e-04 | 8437.33 ms | -100.0% bf16 MFU | 62155 tok/s +step 151/19560 | loss 6.592071 (-1.19z)| norm 0.9032 (-0.81z)| lr 1.29e-04 | 8437.94 ms | -100.0% bf16 MFU | 62154 tok/s +step 152/19560 | loss 6.562912 (-1.21z)| norm 1.3767 (+0.29z)| lr 1.30e-04 | 8434.71 ms | -100.0% bf16 MFU | 62154 tok/s +step 153/19560 | loss 6.529886 (-1.23z)| norm 1.0310 (-0.50z)| lr 1.31e-04 | 8436.55 ms | -100.0% bf16 MFU | 62154 tok/s +step 154/19560 | loss 6.547832 (-1.20z)| norm 1.0290 (-0.49z)| lr 1.32e-04 | 8435.70 ms | -100.0% bf16 MFU | 62154 tok/s +step 155/19560 | loss 6.583127 (-1.15z)| norm 1.0848 (-0.34z)| lr 1.33e-04 | 8435.20 ms | -100.0% bf16 MFU | 62154 tok/s +step 156/19560 | loss 6.578361 (-1.15z)| norm 0.9350 (-0.69z)| lr 1.34e-04 | 8435.22 ms | -100.0% bf16 MFU | 62154 tok/s +step 157/19560 | loss 6.562771 (-1.15z)| norm 0.9060 (-0.75z)| lr 1.35e-04 | 8436.91 ms | -100.0% bf16 MFU | 62153 tok/s +step 158/19560 | loss 6.517971 (-1.20z)| norm 0.9431 (-0.65z)| lr 1.35e-04 | 8437.97 ms | -100.0% bf16 MFU | 62152 tok/s +step 159/19560 | loss 6.505173 (-1.20z)| norm 0.8837 (-0.79z)| lr 1.36e-04 | 8438.62 ms | -100.0% bf16 MFU | 62151 tok/s +step 160/19560 | loss 6.521952 (-1.17z)| norm 0.7683 (-1.07z)| lr 1.37e-04 | 8435.34 ms | -100.0% bf16 MFU | 62151 tok/s +step 161/19560 | loss 6.536079 (-1.14z)| norm 0.9234 (-0.66z)| lr 1.38e-04 | 8431.30 ms | -100.0% bf16 MFU | 62153 tok/s +step 162/19560 | loss 6.542742 (-1.12z)| norm 1.3458 (+0.46z)| lr 1.39e-04 | 8430.04 ms | -100.0% bf16 MFU | 62155 tok/s +step 163/19560 | loss 6.434725 (-1.26z)| norm 1.1087 (-0.15z)| lr 1.40e-04 | 8429.16 ms | -100.0% bf16 MFU | 62157 tok/s +step 164/19560 | loss 6.577487 (-1.06z)| norm 1.3751 (+0.58z)| lr 1.41e-04 | 8429.49 ms | -100.0% bf16 MFU | 62159 tok/s +step 165/19560 | loss 6.493022 (-1.16z)| norm 0.9124 (-0.66z)| lr 1.41e-04 | 8429.42 ms | -100.0% bf16 MFU | 62161 tok/s +step 166/19560 | loss 6.469007 (-1.18z)| norm 0.6668 (-1.32z)| lr 1.42e-04 | 8427.63 ms | -100.0% bf16 MFU | 62164 tok/s +step 167/19560 | loss 6.595911 (-1.00z)| norm 0.9135 (-0.63z)| lr 1.43e-04 | 8431.29 ms | -100.0% bf16 MFU | 62165 tok/s +step 168/19560 | loss 6.397165 (-1.27z)| norm 0.8112 (-0.90z)| lr 1.44e-04 | 8428.07 ms | -100.0% bf16 MFU | 62167 tok/s +step 169/19560 | loss 6.448043 (-1.19z)| norm 0.8405 (-0.81z)| lr 1.45e-04 | 8428.57 ms | -100.0% bf16 MFU | 62168 tok/s +step 170/19560 | loss 6.496977 (-1.11z)| norm 0.7313 (-1.11z)| lr 1.46e-04 | 8427.06 ms | -100.0% bf16 MFU | 62171 tok/s +step 171/19560 | loss 6.471723 (-1.14z)| norm 1.0565 (-0.16z)| lr 1.47e-04 | 8429.94 ms | -100.0% bf16 MFU | 62172 tok/s +step 172/19560 | loss 6.463876 (-1.14z)| norm 1.1765 (+0.22z)| lr 1.47e-04 | 8428.22 ms | -100.0% bf16 MFU | 62174 tok/s +step 173/19560 | loss 6.468000 (-1.13z)| norm 1.1622 (+0.20z)| lr 1.48e-04 | 8429.10 ms | -100.0% bf16 MFU | 62175 tok/s +step 174/19560 | loss 6.419138 (-1.20z)| norm 1.0025 (-0.28z)| lr 1.49e-04 | 8427.32 ms | -100.0% bf16 MFU | 62177 tok/s +step 175/19560 | loss 6.456314 (-1.13z)| norm 1.0616 (-0.08z)| lr 1.50e-04 | 8428.63 ms | -100.0% bf16 MFU | 62178 tok/s +step 176/19560 | loss 6.409801 (-1.20z)| norm 0.9036 (-0.57z)| lr 1.51e-04 | 8436.37 ms | -100.0% bf16 MFU | 62177 tok/s +step 177/19560 | loss 6.395579 (-1.21z)| norm 0.9139 (-0.52z)| lr 1.52e-04 | 8426.96 ms | -100.0% bf16 MFU | 62179 tok/s +step 178/19560 | loss 6.422232 (-1.16z)| norm 0.7780 (-0.95z)| lr 1.53e-04 | 8426.71 ms | -100.0% bf16 MFU | 62180 tok/s +step 179/19560 | loss 6.482417 (-1.05z)| norm 0.7714 (-0.96z)| lr 1.53e-04 | 8429.20 ms | -100.0% bf16 MFU | 62181 tok/s +step 180/19560 | loss 6.407396 (-1.17z)| norm 0.9857 (-0.24z)| lr 1.54e-04 | 8428.24 ms | -100.0% bf16 MFU | 62183 tok/s +step 181/19560 | loss 6.428791 (-1.13z)| norm 1.0809 (+0.09z)| lr 1.55e-04 | 8431.14 ms | -100.0% bf16 MFU | 62183 tok/s +step 182/19560 | loss 6.425170 (-1.13z)| norm 1.1172 (+0.23z)| lr 1.56e-04 | 8432.32 ms | -100.0% bf16 MFU | 62182 tok/s +step 183/19560 | loss 6.473822 (-1.04z)| norm 1.5631 (+1.74z)| lr 1.57e-04 | 8433.25 ms | -100.0% bf16 MFU | 62182 tok/s +step 184/19560 | loss 6.409286 (-1.15z)| norm 1.0838 (+0.12z)| lr 1.58e-04 | 8432.68 ms | -100.0% bf16 MFU | 62181 tok/s +step 185/19560 | loss 6.466430 (-1.03z)| norm 1.0561 (+0.05z)| lr 1.59e-04 | 8430.19 ms | -100.0% bf16 MFU | 62182 tok/s +step 186/19560 | loss 6.397510 (-1.16z)| norm 1.0098 (-0.10z)| lr 1.59e-04 | 8433.07 ms | -100.0% bf16 MFU | 62181 tok/s +step 187/19560 | loss 6.475346 (-1.00z)| norm 1.2951 (+0.93z)| lr 1.60e-04 | 8434.52 ms | -100.0% bf16 MFU | 62180 tok/s +step 188/19560 | loss 6.396613 (-1.16z)| norm 0.8330 (-0.72z)| lr 1.61e-04 | 8432.13 ms | -100.0% bf16 MFU | 62180 tok/s +step 189/19560 | loss 6.393667 (-1.16z)| norm 0.6085 (-1.51z)| lr 1.62e-04 | 8434.01 ms | -100.0% bf16 MFU | 62179 tok/s +step 190/19560 | loss 6.360002 (-1.22z)| norm 0.9255 (-0.35z)| lr 1.63e-04 | 8434.46 ms | -100.0% bf16 MFU | 62178 tok/s +step 191/19560 | loss 6.397192 (-1.14z)| norm 0.9105 (-0.39z)| lr 1.64e-04 | 8440.77 ms | -100.0% bf16 MFU | 62175 tok/s +step 192/19560 | loss 6.392975 (-1.14z)| norm 0.7793 (-0.87z)| lr 1.65e-04 | 8456.88 ms | -100.0% bf16 MFU | 62166 tok/s +step 193/19560 | loss 6.372932 (-1.18z)| norm 0.7763 (-0.86z)| lr 1.65e-04 | 8457.01 ms | -100.0% bf16 MFU | 62157 tok/s +step 194/19560 | loss 6.364835 (-1.19z)| norm 0.7057 (-1.11z)| lr 1.66e-04 | 8459.32 ms | -100.0% bf16 MFU | 62148 tok/s +step 195/19560 | loss 6.418477 (-1.06z)| norm 0.8616 (-0.51z)| lr 1.67e-04 | 8454.37 ms | -100.0% bf16 MFU | 62142 tok/s +step 196/19560 | loss 6.413850 (-1.06z)| norm 1.0037 (+0.03z)| lr 1.68e-04 | 8457.77 ms | -100.0% bf16 MFU | 62134 tok/s +step 197/19560 | loss 6.389941 (-1.12z)| norm 1.1658 (+0.66z)| lr 1.69e-04 | 8451.88 ms | -100.0% bf16 MFU | 62129 tok/s +step 198/19560 | loss 6.374125 (-1.15z)| norm 1.0368 (+0.17z)| lr 1.70e-04 | 8457.99 ms | -100.0% bf16 MFU | 62122 tok/s +step 199/19560 | loss 6.302661 (-1.32z)| norm 0.9458 (-0.17z)| lr 1.71e-04 | 8457.09 ms | -100.0% bf16 MFU | 62116 tok/s +step 200/19560 | loss 6.297486 (-1.33z)| norm 0.7423 (-0.94z)| lr 1.71e-04 | 8455.11 ms | -100.0% bf16 MFU | 62110 tok/s +step 201/19560 | loss 6.323028 (-1.26z)| norm 0.6781 (-1.17z)| lr 1.72e-04 | 8455.14 ms | -100.0% bf16 MFU | 62105 tok/s +step 202/19560 | loss 6.314390 (-1.27z)| norm 0.6329 (-1.33z)| lr 1.73e-04 | 8450.58 ms | -100.0% bf16 MFU | 62102 tok/s +step 203/19560 | loss 6.297926 (-1.32z)| norm 0.6659 (-1.18z)| lr 1.74e-04 | 8456.55 ms | -100.0% bf16 MFU | 62097 tok/s +step 204/19560 | loss 6.315179 (-1.26z)| norm 0.8754 (-0.36z)| lr 1.75e-04 | 8452.28 ms | -100.0% bf16 MFU | 62093 tok/s +step 205/19560 | loss 6.299935 (-1.30z)| norm 1.2423 (+1.08z)| lr 1.76e-04 | 8455.74 ms | -100.0% bf16 MFU | 62089 tok/s +step 206/19560 | loss 6.276480 (-1.36z)| norm 0.8616 (-0.41z)| lr 1.77e-04 | 8457.24 ms | -100.0% bf16 MFU | 62084 tok/s +step 207/19560 | loss 6.361687 (-1.10z)| norm 0.8800 (-0.33z)| lr 1.77e-04 | 8456.25 ms | -100.0% bf16 MFU | 62080 tok/s +step 208/19560 | loss 6.280711 (-1.33z)| norm 1.0887 (+0.49z)| lr 1.78e-04 | 8442.09 ms | -100.0% bf16 MFU | 62081 tok/s +step 209/19560 | loss 6.337261 (-1.15z)| norm 1.0743 (+0.45z)| lr 1.79e-04 | 8449.68 ms | -100.0% bf16 MFU | 62079 tok/s +step 210/19560 | loss 6.249579 (-1.41z)| norm 1.3356 (+1.48z)| lr 1.80e-04 | 8452.34 ms | -100.0% bf16 MFU | 62077 tok/s +step 211/19560 | loss 6.289678 (-1.28z)| norm 0.7193 (-0.96z)| lr 1.81e-04 | 8447.06 ms | -100.0% bf16 MFU | 62076 tok/s +step 212/19560 | loss 6.366944 (-1.03z)| norm 0.8942 (-0.26z)| lr 1.82e-04 | 8449.07 ms | -100.0% bf16 MFU | 62075 tok/s +step 213/19560 | loss 6.311238 (-1.20z)| norm 0.7640 (-0.77z)| lr 1.83e-04 | 8448.87 ms | -100.0% bf16 MFU | 62074 tok/s +step 214/19560 | loss 6.275431 (-1.31z)| norm 0.9312 (-0.11z)| lr 1.83e-04 | 8453.52 ms | -100.0% bf16 MFU | 62071 tok/s +step 215/19560 | loss 6.245791 (-1.40z)| norm 0.9864 (+0.11z)| lr 1.84e-04 | 8446.94 ms | -100.0% bf16 MFU | 62071 tok/s +step 216/19560 | loss 6.217447 (-1.48z)| norm 0.9792 (+0.09z)| lr 1.85e-04 | 8447.81 ms | -100.0% bf16 MFU | 62071 tok/s +step 217/19560 | loss 6.216700 (-1.48z)| norm 0.9172 (-0.15z)| lr 1.86e-04 | 8446.86 ms | -100.0% bf16 MFU | 62071 tok/s +step 218/19560 | loss 6.300950 (-1.18z)| norm 0.8098 (-0.58z)| lr 1.87e-04 | 8447.39 ms | -100.0% bf16 MFU | 62070 tok/s +step 219/19560 | loss 6.300754 (-1.17z)| norm 1.4104 (+1.79z)| lr 1.88e-04 | 8452.64 ms | -100.0% bf16 MFU | 62068 tok/s +step 220/19560 | loss 6.295072 (-1.18z)| norm 1.0079 (+0.20z)| lr 1.89e-04 | 8453.16 ms | -100.0% bf16 MFU | 62066 tok/s +step 221/19560 | loss 6.242665 (-1.36z)| norm 0.9245 (-0.13z)| lr 1.89e-04 | 8445.80 ms | -100.0% bf16 MFU | 62067 tok/s +step 222/19560 | loss 6.229870 (-1.40z)| norm 0.6522 (-1.19z)| lr 1.90e-04 | 8445.63 ms | -100.0% bf16 MFU | 62067 tok/s +step 223/19560 | loss 6.368147 (-0.87z)| norm 0.7370 (-0.86z)| lr 1.91e-04 | 8449.91 ms | -100.0% bf16 MFU | 62066 tok/s +step 224/19560 | loss 6.202819 (-1.49z)| norm 0.7023 (-0.98z)| lr 1.92e-04 | 8443.95 ms | -100.0% bf16 MFU | 62067 tok/s +step 225/19560 | loss 6.190422 (-1.52z)| norm 0.8564 (-0.36z)| lr 1.93e-04 | 8441.70 ms | -100.0% bf16 MFU | 62069 tok/s +step 226/19560 | loss 6.232869 (-1.35z)| norm 1.0724 (+0.51z)| lr 1.94e-04 | 8442.29 ms | -100.0% bf16 MFU | 62071 tok/s +step 227/19560 | loss 6.205657 (-1.44z)| norm 1.2968 (+1.42z)| lr 1.95e-04 | 8443.73 ms | -100.0% bf16 MFU | 62072 tok/s +step 228/19560 | loss 6.234141 (-1.32z)| norm 0.9534 (+0.00z)| lr 1.95e-04 | 8441.96 ms | -100.0% bf16 MFU | 62074 tok/s +step 229/19560 | loss 6.246111 (-1.26z)| norm 1.0419 (+0.36z)| lr 1.96e-04 | 8444.56 ms | -100.0% bf16 MFU | 62074 tok/s +step 230/19560 | loss 6.197363 (-1.45z)| norm 1.0085 (+0.21z)| lr 1.97e-04 | 8442.80 ms | -100.0% bf16 MFU | 62075 tok/s +step 231/19560 | loss 6.187082 (-1.47z)| norm 1.1005 (+0.60z)| lr 1.98e-04 | 8445.17 ms | -100.0% bf16 MFU | 62076 tok/s +step 232/19560 | loss 6.166658 (-1.55z)| norm 0.7557 (-0.82z)| lr 1.99e-04 | 8437.84 ms | -100.0% bf16 MFU | 62079 tok/s +step 233/19560 | loss 6.265611 (-1.12z)| norm 0.9738 (+0.10z)| lr 2.00e-04 | 8440.72 ms | -100.0% bf16 MFU | 62081 tok/s +step 234/19560 | loss 6.181860 (-1.46z)| norm 0.8482 (-0.44z)| lr 2.01e-04 | 8441.92 ms | -100.0% bf16 MFU | 62082 tok/s +step 235/19560 | loss 6.192468 (-1.41z)| norm 1.1200 (+0.87z)| lr 2.01e-04 | 8441.36 ms | -100.0% bf16 MFU | 62083 tok/s +step 236/19560 | loss 6.197932 (-1.37z)| norm 0.8008 (-0.67z)| lr 2.02e-04 | 8440.73 ms | -100.0% bf16 MFU | 62085 tok/s +step 237/19560 | loss 6.195143 (-1.37z)| norm 0.7855 (-0.74z)| lr 2.03e-04 | 8446.39 ms | -100.0% bf16 MFU | 62084 tok/s +step 238/19560 | loss 6.156668 (-1.53z)| norm 0.8290 (-0.52z)| lr 2.04e-04 | 8441.57 ms | -100.0% bf16 MFU | 62085 tok/s +step 239/19560 | loss 6.150063 (-1.55z)| norm 0.8060 (-0.62z)| lr 2.05e-04 | 8443.00 ms | -100.0% bf16 MFU | 62086 tok/s +step 240/19560 | loss 6.185164 (-1.38z)| norm 0.7982 (-0.65z)| lr 2.06e-04 | 8448.19 ms | -100.0% bf16 MFU | 62085 tok/s +step 241/19560 | loss 6.163269 (-1.47z)| norm 1.0515 (+0.56z)| lr 2.07e-04 | 8443.12 ms | -100.0% bf16 MFU | 62085 tok/s +step 242/19560 | loss 6.174234 (-1.40z)| norm 0.8547 (-0.40z)| lr 2.07e-04 | 8438.96 ms | -100.0% bf16 MFU | 62087 tok/s +step 243/19560 | loss 6.150814 (-1.50z)| norm 0.7956 (-0.68z)| lr 2.08e-04 | 8438.26 ms | -100.0% bf16 MFU | 62089 tok/s +step 244/19560 | loss 6.217577 (-1.17z)| norm 0.8475 (-0.44z)| lr 2.09e-04 | 8443.36 ms | -100.0% bf16 MFU | 62090 tok/s +step 245/19560 | loss 6.142106 (-1.52z)| norm 0.7517 (-0.91z)| lr 2.10e-04 | 8437.79 ms | -100.0% bf16 MFU | 62092 tok/s +step 246/19560 | loss 6.248346 (-0.99z)| norm 1.1113 (+0.83z)| lr 2.11e-04 | 8438.52 ms | -100.0% bf16 MFU | 62094 tok/s +step 247/19560 | loss 6.203101 (-1.20z)| norm 0.8420 (-0.49z)| lr 2.12e-04 | 8440.33 ms | -100.0% bf16 MFU | 62095 tok/s +step 248/19560 | loss 6.218516 (-1.11z)| norm 0.8211 (-0.59z)| lr 2.13e-04 | 8442.15 ms | -100.0% bf16 MFU | 62096 tok/s +step 249/19560 | loss 6.109895 (-1.65z)| norm 0.7228 (-1.06z)| lr 2.13e-04 | 8437.99 ms | -100.0% bf16 MFU | 62097 tok/s +step 250/19560 | loss 6.151821 (-1.42z)| norm 0.6808 (-1.26z)| lr 2.14e-04 | 8437.33 ms | -100.0% bf16 MFU | 62100 tok/s +val loss 6.197762 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2492/10042 = 0.248158 +step 251/19560 | loss 6.213551 (-1.09z)| norm 0.6652 (-1.32z)| lr 2.15e-04 | 8431.81 ms | -100.0% bf16 MFU | 62104 tok/s +step 252/19560 | loss 6.130404 (-1.51z)| norm 0.7242 (-1.02z)| lr 2.16e-04 | 8432.00 ms | -100.0% bf16 MFU | 62107 tok/s +step 253/19560 | loss 6.143776 (-1.43z)| norm 0.6477 (-1.37z)| lr 2.17e-04 | 8433.13 ms | -100.0% bf16 MFU | 62110 tok/s +step 254/19560 | loss 6.116277 (-1.57z)| norm 0.7863 (-0.69z)| lr 2.18e-04 | 8433.67 ms | -100.0% bf16 MFU | 62113 tok/s +step 255/19560 | loss 6.112810 (-1.58z)| norm 0.7845 (-0.68z)| lr 2.19e-04 | 8432.87 ms | -100.0% bf16 MFU | 62116 tok/s +step 256/19560 | loss 6.146612 (-1.39z)| norm 0.6672 (-1.24z)| lr 2.19e-04 | 8432.21 ms | -100.0% bf16 MFU | 62119 tok/s +step 257/19560 | loss 6.192252 (-1.12z)| norm 0.7167 (-0.98z)| lr 2.20e-04 | 8440.45 ms | -100.0% bf16 MFU | 62119 tok/s +step 258/19560 | loss 6.123223 (-1.50z)| norm 0.8260 (-0.46z)| lr 2.21e-04 | 8437.25 ms | -100.0% bf16 MFU | 62120 tok/s +step 259/19560 | loss 6.106413 (-1.58z)| norm 0.6943 (-1.10z)| lr 2.22e-04 | 8435.22 ms | -100.0% bf16 MFU | 62122 tok/s +step 260/19560 | loss 6.074505 (-1.75z)| norm 0.8020 (-0.58z)| lr 2.23e-04 | 8437.61 ms | -100.0% bf16 MFU | 62123 tok/s +step 261/19560 | loss 6.092783 (-1.64z)| norm 1.0199 (+0.48z)| lr 2.24e-04 | 8438.39 ms | -100.0% bf16 MFU | 62123 tok/s +step 262/19560 | loss 6.147604 (-1.30z)| norm 1.2489 (+1.58z)| lr 2.25e-04 | 8434.50 ms | -100.0% bf16 MFU | 62125 tok/s +step 263/19560 | loss 6.089511 (-1.63z)| norm 0.8908 (-0.19z)| lr 2.25e-04 | 8437.94 ms | -100.0% bf16 MFU | 62125 tok/s +step 264/19560 | loss 6.055043 (-1.81z)| norm 0.9057 (-0.12z)| lr 2.26e-04 | 8440.14 ms | -100.0% bf16 MFU | 62125 tok/s +step 265/19560 | loss 6.128433 (-1.35z)| norm 0.9358 (+0.03z)| lr 2.27e-04 | 8438.26 ms | -100.0% bf16 MFU | 62125 tok/s +step 266/19560 | loss 6.119726 (-1.39z)| norm 0.6490 (-1.38z)| lr 2.28e-04 | 8436.26 ms | -100.0% bf16 MFU | 62126 tok/s +step 267/19560 | loss 6.103078 (-1.47z)| norm 0.6567 (-1.32z)| lr 2.29e-04 | 8435.58 ms | -100.0% bf16 MFU | 62128 tok/s +step 268/19560 | loss 6.093471 (-1.51z)| norm 0.6954 (-1.11z)| lr 2.30e-04 | 8439.19 ms | -100.0% bf16 MFU | 62128 tok/s +step 269/19560 | loss 6.087685 (-1.53z)| norm 0.8840 (-0.17z)| lr 2.31e-04 | 8441.09 ms | -100.0% bf16 MFU | 62127 tok/s +step 270/19560 | loss 6.192928 (-0.86z)| norm 1.0272 (+0.56z)| lr 2.31e-04 | 8437.05 ms | -100.0% bf16 MFU | 62127 tok/s +step 271/19560 | loss 6.134328 (-1.22z)| norm 1.3975 (+2.35z)| lr 2.32e-04 | 8440.50 ms | -100.0% bf16 MFU | 62127 tok/s +step 272/19560 | loss 6.067671 (-1.62z)| norm 1.4348 (+2.47z)| lr 2.33e-04 | 8439.34 ms | -100.0% bf16 MFU | 62127 tok/s +step 273/19560 | loss 6.093391 (-1.43z)| norm 1.0629 (+0.65z)| lr 2.34e-04 | 8435.73 ms | -100.0% bf16 MFU | 62128 tok/s +step 274/19560 | loss 6.047095 (-1.70z)| norm 0.9221 (-0.05z)| lr 2.35e-04 | 8438.20 ms | -100.0% bf16 MFU | 62128 tok/s +step 275/19560 | loss 6.118476 (-1.23z)| norm 0.9316 (-0.01z)| lr 2.36e-04 | 8438.87 ms | -100.0% bf16 MFU | 62128 tok/s +step 276/19560 | loss 6.037366 (-1.73z)| norm 1.0878 (+0.75z)| lr 2.37e-04 | 8440.71 ms | -100.0% bf16 MFU | 62127 tok/s +step 277/19560 | loss 6.029911 (-1.76z)| norm 0.9502 (+0.08z)| lr 2.37e-04 | 8440.84 ms | -100.0% bf16 MFU | 62127 tok/s +step 278/19560 | loss 6.039595 (-1.67z)| norm 0.8192 (-0.56z)| lr 2.38e-04 | 8440.62 ms | -100.0% bf16 MFU | 62126 tok/s +step 279/19560 | loss 6.057557 (-1.54z)| norm 0.8297 (-0.50z)| lr 2.39e-04 | 8436.42 ms | -100.0% bf16 MFU | 62127 tok/s +step 280/19560 | loss 6.059269 (-1.51z)| norm 0.7640 (-0.83z)| lr 2.40e-04 | 8437.07 ms | -100.0% bf16 MFU | 62128 tok/s +step 281/19560 | loss 6.059142 (-1.48z)| norm 0.6700 (-1.30z)| lr 2.41e-04 | 8436.91 ms | -100.0% bf16 MFU | 62129 tok/s +step 282/19560 | loss 6.044037 (-1.56z)| norm 0.6413 (-1.42z)| lr 2.42e-04 | 8436.11 ms | -100.0% bf16 MFU | 62130 tok/s +step 283/19560 | loss 5.996150 (-1.85z)| norm 0.6202 (-1.50z)| lr 2.43e-04 | 8439.34 ms | -100.0% bf16 MFU | 62129 tok/s +step 284/19560 | loss 6.056411 (-1.44z)| norm 0.7158 (-1.00z)| lr 2.43e-04 | 8437.36 ms | -100.0% bf16 MFU | 62130 tok/s +step 285/19560 | loss 6.042507 (-1.51z)| norm 0.7887 (-0.62z)| lr 2.44e-04 | 8439.20 ms | -100.0% bf16 MFU | 62130 tok/s +step 286/19560 | loss 5.961585 (-2.02z)| norm 0.7914 (-0.60z)| lr 2.45e-04 | 8437.85 ms | -100.0% bf16 MFU | 62130 tok/s +step 287/19560 | loss 5.981151 (-1.86z)| norm 0.7924 (-0.59z)| lr 2.46e-04 | 8437.99 ms | -100.0% bf16 MFU | 62130 tok/s +step 288/19560 | loss 6.027915 (-1.52z)| norm 0.8544 (-0.28z)| lr 2.47e-04 | 8439.23 ms | -100.0% bf16 MFU | 62130 tok/s +step 289/19560 | loss 6.044820 (-1.39z)| norm 0.9372 (+0.14z)| lr 2.48e-04 | 8436.39 ms | -100.0% bf16 MFU | 62131 tok/s +step 290/19560 | loss 6.030326 (-1.47z)| norm 1.4582 (+2.74z)| lr 2.49e-04 | 8437.63 ms | -100.0% bf16 MFU | 62131 tok/s +step 291/19560 | loss 6.014317 (-1.55z)| norm 0.8669 (-0.21z)| lr 2.49e-04 | 8437.04 ms | -100.0% bf16 MFU | 62131 tok/s +step 292/19560 | loss 6.015982 (-1.53z)| norm 1.0485 (+0.73z)| lr 2.50e-04 | 8440.96 ms | -100.0% bf16 MFU | 62130 tok/s +step 293/19560 | loss 6.039289 (-1.35z)| norm 1.0055 (+0.50z)| lr 2.51e-04 | 8438.47 ms | -100.0% bf16 MFU | 62130 tok/s +step 294/19560 | loss 5.994616 (-1.64z)| norm 0.8349 (-0.38z)| lr 2.52e-04 | 8439.20 ms | -100.0% bf16 MFU | 62130 tok/s +step 295/19560 | loss 6.024224 (-1.42z)| norm 0.8128 (-0.49z)| lr 2.53e-04 | 8436.62 ms | -100.0% bf16 MFU | 62131 tok/s +step 296/19560 | loss 6.006327 (-1.52z)| norm 0.8771 (-0.16z)| lr 2.54e-04 | 8438.05 ms | -100.0% bf16 MFU | 62131 tok/s +step 297/19560 | loss 5.955547 (-1.85z)| norm 1.1253 (+1.10z)| lr 2.55e-04 | 8456.75 ms | -100.0% bf16 MFU | 62124 tok/s +step 298/19560 | loss 6.007855 (-1.46z)| norm 0.9790 (+0.34z)| lr 2.55e-04 | 8465.63 ms | -100.0% bf16 MFU | 62115 tok/s +step 299/19560 | loss 6.020070 (-1.36z)| norm 0.9328 (+0.11z)| lr 2.56e-04 | 8466.27 ms | -100.0% bf16 MFU | 62105 tok/s +step 300/19560 | loss 5.938631 (-1.90z)| norm 0.9036 (-0.03z)| lr 2.57e-04 | 8463.14 ms | -100.0% bf16 MFU | 62098 tok/s +step 301/19560 | loss 6.014291 (-1.35z)| norm 1.2173 (+1.59z)| lr 2.58e-04 | 8462.73 ms | -100.0% bf16 MFU | 62090 tok/s +step 302/19560 | loss 6.027766 (-1.24z)| norm 0.7909 (-0.61z)| lr 2.59e-04 | 8460.45 ms | -100.0% bf16 MFU | 62084 tok/s +step 303/19560 | loss 6.022500 (-1.26z)| norm 0.6930 (-1.10z)| lr 2.60e-04 | 8466.12 ms | -100.0% bf16 MFU | 62076 tok/s +step 304/19560 | loss 5.988136 (-1.48z)| norm 0.8288 (-0.39z)| lr 2.61e-04 | 8457.91 ms | -100.0% bf16 MFU | 62072 tok/s +step 305/19560 | loss 5.947268 (-1.74z)| norm 0.8510 (-0.28z)| lr 2.61e-04 | 8461.53 ms | -100.0% bf16 MFU | 62066 tok/s +step 306/19560 | loss 5.947697 (-1.71z)| norm 0.9091 (+0.02z)| lr 2.62e-04 | 8462.58 ms | -100.0% bf16 MFU | 62061 tok/s +step 307/19560 | loss 5.932615 (-1.80z)| norm 0.9551 (+0.25z)| lr 2.63e-04 | 8459.30 ms | -100.0% bf16 MFU | 62057 tok/s +step 308/19560 | loss 5.894580 (-2.03z)| norm 0.9617 (+0.28z)| lr 2.64e-04 | 8461.34 ms | -100.0% bf16 MFU | 62052 tok/s +step 309/19560 | loss 5.992926 (-1.31z)| norm 0.9730 (+0.35z)| lr 2.65e-04 | 8455.46 ms | -100.0% bf16 MFU | 62050 tok/s +step 310/19560 | loss 5.941513 (-1.66z)| norm 0.9239 (+0.10z)| lr 2.66e-04 | 8459.23 ms | -100.0% bf16 MFU | 62046 tok/s +step 311/19560 | loss 5.958200 (-1.52z)| norm 0.9006 (+0.01z)| lr 2.67e-04 | 8457.03 ms | -100.0% bf16 MFU | 62043 tok/s +step 312/19560 | loss 5.931982 (-1.69z)| norm 1.0292 (+0.72z)| lr 2.67e-04 | 8463.33 ms | -100.0% bf16 MFU | 62039 tok/s +step 313/19560 | loss 5.876442 (-2.06z)| norm 1.2862 (+2.09z)| lr 2.68e-04 | 8453.99 ms | -100.0% bf16 MFU | 62038 tok/s +step 314/19560 | loss 5.917127 (-1.74z)| norm 1.1109 (+1.13z)| lr 2.69e-04 | 8455.09 ms | -100.0% bf16 MFU | 62036 tok/s +step 315/19560 | loss 6.000117 (-1.13z)| norm 1.2276 (+1.77z)| lr 2.70e-04 | 8456.31 ms | -100.0% bf16 MFU | 62034 tok/s +step 316/19560 | loss 5.868444 (-2.06z)| norm 0.9934 (+0.49z)| lr 2.71e-04 | 8456.30 ms | -100.0% bf16 MFU | 62033 tok/s +step 317/19560 | loss 5.953414 (-1.42z)| norm 0.9116 (+0.04z)| lr 2.72e-04 | 8454.82 ms | -100.0% bf16 MFU | 62031 tok/s +step 318/19560 | loss 5.971694 (-1.26z)| norm 0.9056 (+0.01z)| lr 2.73e-04 | 8452.50 ms | -100.0% bf16 MFU | 62031 tok/s +step 319/19560 | loss 5.934337 (-1.52z)| norm 0.9715 (+0.36z)| lr 2.73e-04 | 8456.98 ms | -100.0% bf16 MFU | 62029 tok/s +step 320/19560 | loss 5.907097 (-1.70z)| norm 1.1970 (+1.57z)| lr 2.74e-04 | 8456.66 ms | -100.0% bf16 MFU | 62028 tok/s +step 321/19560 | loss 5.954335 (-1.33z)| norm 0.6877 (-1.19z)| lr 2.75e-04 | 8451.06 ms | -100.0% bf16 MFU | 62028 tok/s +step 322/19560 | loss 5.964081 (-1.24z)| norm 0.7397 (-0.91z)| lr 2.76e-04 | 8455.17 ms | -100.0% bf16 MFU | 62027 tok/s +step 323/19560 | loss 5.935753 (-1.44z)| norm 0.7558 (-0.82z)| lr 2.77e-04 | 8457.98 ms | -100.0% bf16 MFU | 62025 tok/s +step 324/19560 | loss 5.922614 (-1.53z)| norm 0.8603 (-0.25z)| lr 2.78e-04 | 8456.25 ms | -100.0% bf16 MFU | 62024 tok/s +step 325/19560 | loss 5.944735 (-1.35z)| norm 0.8143 (-0.48z)| lr 2.79e-04 | 8449.63 ms | -100.0% bf16 MFU | 62025 tok/s +step 326/19560 | loss 5.861885 (-1.96z)| norm 0.8465 (-0.30z)| lr 2.79e-04 | 8449.34 ms | -100.0% bf16 MFU | 62027 tok/s +step 327/19560 | loss 5.911884 (-1.55z)| norm 0.7492 (-0.82z)| lr 2.80e-04 | 8457.53 ms | -100.0% bf16 MFU | 62025 tok/s +step 328/19560 | loss 5.879344 (-1.77z)| norm 0.9841 (+0.45z)| lr 2.81e-04 | 8455.74 ms | -100.0% bf16 MFU | 62024 tok/s +step 329/19560 | loss 5.878876 (-1.74z)| norm 1.2856 (+2.05z)| lr 2.82e-04 | 8448.78 ms | -100.0% bf16 MFU | 62025 tok/s +step 330/19560 | loss 5.838237 (-2.02z)| norm 0.8012 (-0.58z)| lr 2.83e-04 | 8452.34 ms | -100.0% bf16 MFU | 62025 tok/s +step 331/19560 | loss 5.883521 (-1.64z)| norm 0.7419 (-0.91z)| lr 2.84e-04 | 8449.78 ms | -100.0% bf16 MFU | 62027 tok/s +step 332/19560 | loss 5.820986 (-2.08z)| norm 1.1907 (+1.51z)| lr 2.85e-04 | 8458.30 ms | -100.0% bf16 MFU | 62024 tok/s +step 333/19560 | loss 5.885140 (-1.56z)| norm 1.0132 (+0.57z)| lr 2.85e-04 | 8448.18 ms | -100.0% bf16 MFU | 62026 tok/s +step 334/19560 | loss 5.927871 (-1.22z)| norm 1.1854 (+1.48z)| lr 2.86e-04 | 8446.42 ms | -100.0% bf16 MFU | 62029 tok/s +step 335/19560 | loss 5.942362 (-1.10z)| norm 1.0253 (+0.61z)| lr 2.87e-04 | 8451.09 ms | -100.0% bf16 MFU | 62029 tok/s +step 336/19560 | loss 6.003396 (-0.61z)| norm 1.2031 (+1.56z)| lr 2.88e-04 | 8454.76 ms | -100.0% bf16 MFU | 62028 tok/s +step 337/19560 | loss 5.847037 (-1.81z)| norm 1.0373 (+0.67z)| lr 2.89e-04 | 8451.52 ms | -100.0% bf16 MFU | 62028 tok/s +step 338/19560 | loss 5.876528 (-1.55z)| norm 0.7140 (-1.07z)| lr 2.90e-04 | 8451.92 ms | -100.0% bf16 MFU | 62029 tok/s +step 339/19560 | loss 5.830075 (-1.88z)| norm 0.6902 (-1.20z)| lr 2.91e-04 | 8456.22 ms | -100.0% bf16 MFU | 62027 tok/s +step 340/19560 | loss 5.835734 (-1.82z)| norm 0.7243 (-1.00z)| lr 2.91e-04 | 8444.10 ms | -100.0% bf16 MFU | 62030 tok/s +step 341/19560 | loss 5.872680 (-1.51z)| norm 0.9277 (+0.11z)| lr 2.92e-04 | 8452.29 ms | -100.0% bf16 MFU | 62030 tok/s +step 342/19560 | loss 5.860566 (-1.58z)| norm 1.3715 (+2.45z)| lr 2.93e-04 | 8448.18 ms | -100.0% bf16 MFU | 62032 tok/s +step 343/19560 | loss 5.856997 (-1.59z)| norm 0.8267 (-0.45z)| lr 2.94e-04 | 8440.42 ms | -100.0% bf16 MFU | 62036 tok/s +step 344/19560 | loss 5.841364 (-1.68z)| norm 0.9803 (+0.37z)| lr 2.95e-04 | 8452.27 ms | -100.0% bf16 MFU | 62036 tok/s +step 345/19560 | loss 5.854559 (-1.55z)| norm 1.0186 (+0.57z)| lr 2.96e-04 | 8449.11 ms | -100.0% bf16 MFU | 62036 tok/s +step 346/19560 | loss 5.881502 (-1.32z)| norm 1.3349 (+2.19z)| lr 2.97e-04 | 8444.29 ms | -100.0% bf16 MFU | 62039 tok/s +step 347/19560 | loss 5.815949 (-1.82z)| norm 0.7867 (-0.67z)| lr 2.97e-04 | 8447.87 ms | -100.0% bf16 MFU | 62040 tok/s +step 348/19560 | loss 5.915710 (-1.01z)| norm 0.7866 (-0.66z)| lr 2.98e-04 | 8447.71 ms | -100.0% bf16 MFU | 62041 tok/s +step 349/19560 | loss 5.773650 (-2.11z)| norm 0.7010 (-1.10z)| lr 2.99e-04 | 8447.11 ms | -100.0% bf16 MFU | 62043 tok/s +step 350/19560 | loss 5.844160 (-1.52z)| norm 0.8412 (-0.36z)| lr 3.00e-04 | 8447.45 ms | -100.0% bf16 MFU | 62044 tok/s +step 351/19560 | loss 5.848439 (-1.48z)| norm 0.9189 (+0.05z)| lr 3.01e-04 | 8450.37 ms | -100.0% bf16 MFU | 62044 tok/s +step 352/19560 | loss 5.870335 (-1.29z)| norm 1.0458 (+0.72z)| lr 3.02e-04 | 8442.35 ms | -100.0% bf16 MFU | 62047 tok/s +step 353/19560 | loss 5.783573 (-1.95z)| norm 0.9310 (+0.10z)| lr 3.03e-04 | 8446.23 ms | -100.0% bf16 MFU | 62048 tok/s +step 354/19560 | loss 5.843041 (-1.45z)| norm 1.0062 (+0.51z)| lr 3.03e-04 | 8438.77 ms | -100.0% bf16 MFU | 62052 tok/s +step 355/19560 | loss 5.829731 (-1.53z)| norm 1.0542 (+0.79z)| lr 3.04e-04 | 8445.89 ms | -100.0% bf16 MFU | 62053 tok/s +step 356/19560 | loss 5.852941 (-1.33z)| norm 0.8885 (-0.12z)| lr 3.05e-04 | 8448.35 ms | -100.0% bf16 MFU | 62053 tok/s +step 357/19560 | loss 5.811649 (-1.64z)| norm 0.8478 (-0.34z)| lr 3.06e-04 | 8451.29 ms | -100.0% bf16 MFU | 62053 tok/s +step 358/19560 | loss 5.783350 (-1.84z)| norm 1.2129 (+1.65z)| lr 3.07e-04 | 8444.53 ms | -100.0% bf16 MFU | 62054 tok/s +step 359/19560 | loss 5.841149 (-1.35z)| norm 0.8923 (-0.09z)| lr 3.08e-04 | 8448.28 ms | -100.0% bf16 MFU | 62054 tok/s +step 360/19560 | loss 5.824903 (-1.46z)| norm 1.0621 (+0.83z)| lr 3.09e-04 | 8441.95 ms | -100.0% bf16 MFU | 62057 tok/s +step 361/19560 | loss 5.808461 (-1.57z)| norm 1.2907 (+2.03z)| lr 3.09e-04 | 8441.69 ms | -100.0% bf16 MFU | 62059 tok/s +step 362/19560 | loss 5.873861 (-1.02z)| norm 1.0969 (+0.97z)| lr 3.10e-04 | 8447.81 ms | -100.0% bf16 MFU | 62060 tok/s +step 363/19560 | loss 5.762534 (-1.91z)| norm 1.1975 (+1.51z)| lr 3.11e-04 | 8442.02 ms | -100.0% bf16 MFU | 62062 tok/s +step 364/19560 | loss 5.792645 (-1.63z)| norm 1.2403 (+1.70z)| lr 3.12e-04 | 8443.24 ms | -100.0% bf16 MFU | 62064 tok/s +step 365/19560 | loss 5.818675 (-1.40z)| norm 0.8331 (-0.46z)| lr 3.13e-04 | 8446.41 ms | -100.0% bf16 MFU | 62064 tok/s +step 366/19560 | loss 5.800385 (-1.52z)| norm 0.8496 (-0.38z)| lr 3.14e-04 | 8445.63 ms | -100.0% bf16 MFU | 62065 tok/s +step 367/19560 | loss 5.819321 (-1.35z)| norm 0.8336 (-0.46z)| lr 3.15e-04 | 8446.37 ms | -100.0% bf16 MFU | 62065 tok/s +step 368/19560 | loss 5.865353 (-0.96z)| norm 0.8962 (-0.13z)| lr 3.15e-04 | 8445.56 ms | -100.0% bf16 MFU | 62066 tok/s +step 369/19560 | loss 5.713481 (-2.16z)| norm 0.7782 (-0.75z)| lr 3.16e-04 | 8446.68 ms | -100.0% bf16 MFU | 62066 tok/s +step 370/19560 | loss 5.721330 (-2.06z)| norm 0.8840 (-0.19z)| lr 3.17e-04 | 8446.06 ms | -100.0% bf16 MFU | 62066 tok/s +step 371/19560 | loss 5.720601 (-2.02z)| norm 0.9095 (-0.06z)| lr 3.18e-04 | 8445.97 ms | -100.0% bf16 MFU | 62067 tok/s +step 372/19560 | loss 5.713697 (-2.04z)| norm 0.9609 (+0.21z)| lr 3.19e-04 | 8443.62 ms | -100.0% bf16 MFU | 62068 tok/s +step 373/19560 | loss 5.740957 (-1.79z)| norm 0.9136 (-0.05z)| lr 3.20e-04 | 8453.20 ms | -100.0% bf16 MFU | 62066 tok/s +step 374/19560 | loss 5.803090 (-1.28z)| norm 1.0399 (+0.63z)| lr 3.21e-04 | 8442.42 ms | -100.0% bf16 MFU | 62068 tok/s +step 375/19560 | loss 5.782050 (-1.43z)| norm 0.9129 (-0.05z)| lr 3.21e-04 | 8445.96 ms | -100.0% bf16 MFU | 62068 tok/s +step 376/19560 | loss 5.692233 (-2.14z)| norm 0.7451 (-0.95z)| lr 3.22e-04 | 8446.19 ms | -100.0% bf16 MFU | 62068 tok/s +step 377/19560 | loss 5.736783 (-1.74z)| norm 0.8668 (-0.30z)| lr 3.23e-04 | 8441.12 ms | -100.0% bf16 MFU | 62070 tok/s +step 378/19560 | loss 5.825133 (-1.00z)| norm 0.7734 (-0.81z)| lr 3.24e-04 | 8446.48 ms | -100.0% bf16 MFU | 62071 tok/s +step 379/19560 | loss 5.856173 (-0.74z)| norm 0.9447 (+0.10z)| lr 3.25e-04 | 8440.87 ms | -100.0% bf16 MFU | 62073 tok/s +step 380/19560 | loss 5.792763 (-1.25z)| norm 1.3759 (+2.38z)| lr 3.26e-04 | 8446.66 ms | -100.0% bf16 MFU | 62073 tok/s +step 381/19560 | loss 5.737338 (-1.69z)| norm 1.1507 (+1.16z)| lr 3.27e-04 | 8452.61 ms | -100.0% bf16 MFU | 62070 tok/s +step 382/19560 | loss 5.766708 (-1.42z)| norm 1.0042 (+0.36z)| lr 3.27e-04 | 8445.79 ms | -100.0% bf16 MFU | 62071 tok/s +step 383/19560 | loss 5.778786 (-1.30z)| norm 0.9545 (+0.09z)| lr 3.28e-04 | 8442.84 ms | -100.0% bf16 MFU | 62072 tok/s +step 384/19560 | loss 5.717426 (-1.79z)| norm 1.0357 (+0.52z)| lr 3.29e-04 | 8442.88 ms | -100.0% bf16 MFU | 62073 tok/s +step 385/19560 | loss 5.796866 (-1.11z)| norm 0.7869 (-0.84z)| lr 3.30e-04 | 8444.71 ms | -100.0% bf16 MFU | 62074 tok/s +step 386/19560 | loss 5.754298 (-1.46z)| norm 0.9145 (-0.15z)| lr 3.31e-04 | 8441.22 ms | -100.0% bf16 MFU | 62076 tok/s +step 387/19560 | loss 5.690953 (-1.96z)| norm 0.9524 (+0.05z)| lr 3.32e-04 | 8443.42 ms | -100.0% bf16 MFU | 62077 tok/s +step 388/19560 | loss 5.717247 (-1.70z)| norm 1.0509 (+0.58z)| lr 3.33e-04 | 8437.90 ms | -100.0% bf16 MFU | 62080 tok/s +step 389/19560 | loss 5.696252 (-1.85z)| norm 1.3537 (+2.19z)| lr 3.33e-04 | 8441.79 ms | -100.0% bf16 MFU | 62081 tok/s +step 390/19560 | loss 5.763199 (-1.27z)| norm 1.0136 (+0.36z)| lr 3.34e-04 | 8443.17 ms | -100.0% bf16 MFU | 62082 tok/s +step 391/19560 | loss 5.678538 (-1.95z)| norm 1.1648 (+1.17z)| lr 3.35e-04 | 8441.25 ms | -100.0% bf16 MFU | 62083 tok/s +step 392/19560 | loss 5.749193 (-1.33z)| norm 0.7890 (-0.86z)| lr 3.36e-04 | 8441.99 ms | -100.0% bf16 MFU | 62084 tok/s +step 393/19560 | loss 5.694565 (-1.77z)| norm 0.8330 (-0.62z)| lr 3.37e-04 | 8443.81 ms | -100.0% bf16 MFU | 62085 tok/s +step 394/19560 | loss 5.718903 (-1.54z)| norm 1.0101 (+0.33z)| lr 3.38e-04 | 8447.88 ms | -100.0% bf16 MFU | 62083 tok/s +step 395/19560 | loss 5.702663 (-1.66z)| norm 1.0365 (+0.46z)| lr 3.39e-04 | 8445.34 ms | -100.0% bf16 MFU | 62083 tok/s +step 396/19560 | loss 5.704408 (-1.62z)| norm 0.9151 (-0.22z)| lr 3.39e-04 | 8445.06 ms | -100.0% bf16 MFU | 62083 tok/s +step 397/19560 | loss 5.748791 (-1.22z)| norm 1.0335 (+0.43z)| lr 3.40e-04 | 8443.44 ms | -100.0% bf16 MFU | 62084 tok/s +step 398/19560 | loss 5.762572 (-1.10z)| norm 0.7556 (-1.09z)| lr 3.41e-04 | 8437.36 ms | -100.0% bf16 MFU | 62086 tok/s +step 399/19560 | loss 5.744789 (-1.24z)| norm 0.7691 (-1.01z)| lr 3.42e-04 | 8438.83 ms | -100.0% bf16 MFU | 62089 tok/s +step 400/19560 | loss 5.651929 (-2.03z)| norm 0.8272 (-0.68z)| lr 3.43e-04 | 8441.52 ms | -100.0% bf16 MFU | 62090 tok/s +step 401/19560 | loss 5.697699 (-1.60z)| norm 0.8079 (-0.78z)| lr 3.44e-04 | 8438.00 ms | -100.0% bf16 MFU | 62092 tok/s +step 402/19560 | loss 5.630303 (-2.15z)| norm 0.9935 (+0.29z)| lr 3.45e-04 | 8437.31 ms | -100.0% bf16 MFU | 62094 tok/s +step 403/19560 | loss 5.651294 (-1.94z)| norm 0.9738 (+0.18z)| lr 3.45e-04 | 8440.99 ms | -100.0% bf16 MFU | 62095 tok/s +step 404/19560 | loss 5.690030 (-1.57z)| norm 1.1453 (+1.17z)| lr 3.46e-04 | 8443.04 ms | -100.0% bf16 MFU | 62095 tok/s +step 405/19560 | loss 5.745972 (-1.06z)| norm 0.9791 (+0.21z)| lr 3.47e-04 | 8439.87 ms | -100.0% bf16 MFU | 62096 tok/s +step 406/19560 | loss 5.678847 (-1.63z)| norm 0.9037 (-0.23z)| lr 3.48e-04 | 8443.20 ms | -100.0% bf16 MFU | 62096 tok/s +step 407/19560 | loss 5.678125 (-1.61z)| norm 0.7251 (-1.26z)| lr 3.49e-04 | 8436.31 ms | -100.0% bf16 MFU | 62099 tok/s +step 408/19560 | loss 5.669782 (-1.66z)| norm 0.9109 (-0.20z)| lr 3.50e-04 | 8442.78 ms | -100.0% bf16 MFU | 62099 tok/s +step 409/19560 | loss 5.684138 (-1.52z)| norm 0.9859 (+0.23z)| lr 3.51e-04 | 8441.72 ms | -100.0% bf16 MFU | 62099 tok/s +step 410/19560 | loss 5.660527 (-1.70z)| norm 0.9666 (+0.10z)| lr 3.51e-04 | 8442.18 ms | -100.0% bf16 MFU | 62099 tok/s +step 411/19560 | loss 5.682633 (-1.48z)| norm 0.8581 (-0.56z)| lr 3.52e-04 | 8440.18 ms | -100.0% bf16 MFU | 62100 tok/s +step 412/19560 | loss 5.699544 (-1.31z)| norm 0.8131 (-0.84z)| lr 3.53e-04 | 8441.05 ms | -100.0% bf16 MFU | 62101 tok/s +step 413/19560 | loss 5.617124 (-2.01z)| norm 0.8248 (-0.77z)| lr 3.54e-04 | 8441.23 ms | -100.0% bf16 MFU | 62101 tok/s +step 414/19560 | loss 5.667464 (-1.54z)| norm 0.7712 (-1.09z)| lr 3.55e-04 | 8439.93 ms | -100.0% bf16 MFU | 62102 tok/s +step 415/19560 | loss 5.670411 (-1.48z)| norm 0.7250 (-1.37z)| lr 3.56e-04 | 8438.37 ms | -100.0% bf16 MFU | 62104 tok/s +step 416/19560 | loss 5.604097 (-2.03z)| norm 0.8206 (-0.79z)| lr 3.57e-04 | 8438.19 ms | -100.0% bf16 MFU | 62105 tok/s +step 417/19560 | loss 5.609651 (-1.95z)| norm 0.8549 (-0.58z)| lr 3.57e-04 | 8442.77 ms | -100.0% bf16 MFU | 62105 tok/s +step 418/19560 | loss 5.605047 (-1.96z)| norm 0.8091 (-0.85z)| lr 3.58e-04 | 8438.96 ms | -100.0% bf16 MFU | 62106 tok/s +step 419/19560 | loss 5.544348 (-2.42z)| norm 0.9382 (-0.05z)| lr 3.59e-04 | 8439.56 ms | -100.0% bf16 MFU | 62107 tok/s +step 420/19560 | loss 5.641644 (-1.56z)| norm 1.0517 (+0.66z)| lr 3.60e-04 | 8438.09 ms | -100.0% bf16 MFU | 62108 tok/s +step 421/19560 | loss 5.644943 (-1.51z)| norm 0.9058 (-0.25z)| lr 3.61e-04 | 8437.02 ms | -100.0% bf16 MFU | 62110 tok/s +step 422/19560 | loss 5.554695 (-2.24z)| norm 0.8575 (-0.55z)| lr 3.62e-04 | 8437.65 ms | -100.0% bf16 MFU | 62111 tok/s +step 423/19560 | loss 5.521500 (-2.47z)| norm 1.1532 (+1.27z)| lr 3.63e-04 | 8437.51 ms | -100.0% bf16 MFU | 62113 tok/s +step 424/19560 | loss 5.648625 (-1.36z)| norm 1.2356 (+1.74z)| lr 3.63e-04 | 8441.65 ms | -100.0% bf16 MFU | 62112 tok/s +step 425/19560 | loss 5.639465 (-1.42z)| norm 1.0933 (+0.87z)| lr 3.64e-04 | 8439.86 ms | -100.0% bf16 MFU | 62113 tok/s +step 426/19560 | loss 5.589069 (-1.82z)| norm 0.9177 (-0.20z)| lr 3.65e-04 | 8441.99 ms | -100.0% bf16 MFU | 62112 tok/s +step 427/19560 | loss 5.619831 (-1.54z)| norm 0.9580 (+0.04z)| lr 3.66e-04 | 8440.48 ms | -100.0% bf16 MFU | 62112 tok/s +step 428/19560 | loss 5.618422 (-1.52z)| norm 0.9784 (+0.17z)| lr 3.67e-04 | 8437.88 ms | -100.0% bf16 MFU | 62114 tok/s +step 429/19560 | loss 5.605934 (-1.61z)| norm 1.1192 (+1.04z)| lr 3.68e-04 | 8441.43 ms | -100.0% bf16 MFU | 62113 tok/s +step 430/19560 | loss 5.603659 (-1.61z)| norm 1.1606 (+1.28z)| lr 3.69e-04 | 8441.65 ms | -100.0% bf16 MFU | 62113 tok/s +step 431/19560 | loss 5.630184 (-1.36z)| norm 0.9528 (-0.02z)| lr 3.69e-04 | 8437.09 ms | -100.0% bf16 MFU | 62114 tok/s +step 432/19560 | loss 5.573090 (-1.83z)| norm 0.8291 (-0.79z)| lr 3.70e-04 | 8441.40 ms | -100.0% bf16 MFU | 62114 tok/s +step 433/19560 | loss 5.555979 (-1.94z)| norm 0.8970 (-0.37z)| lr 3.71e-04 | 8437.55 ms | -100.0% bf16 MFU | 62115 tok/s +step 434/19560 | loss 5.568200 (-1.80z)| norm 0.7621 (-1.20z)| lr 3.72e-04 | 8442.64 ms | -100.0% bf16 MFU | 62115 tok/s +step 435/19560 | loss 5.614604 (-1.38z)| norm 0.8603 (-0.58z)| lr 3.73e-04 | 8437.97 ms | -100.0% bf16 MFU | 62116 tok/s +step 436/19560 | loss 5.616482 (-1.34z)| norm 1.1350 (+1.11z)| lr 3.74e-04 | 8437.24 ms | -100.0% bf16 MFU | 62117 tok/s +step 437/19560 | loss 5.612557 (-1.36z)| norm 1.0598 (+0.64z)| lr 3.75e-04 | 8437.30 ms | -100.0% bf16 MFU | 62118 tok/s +step 438/19560 | loss 5.627433 (-1.21z)| norm 1.0382 (+0.50z)| lr 3.75e-04 | 8436.62 ms | -100.0% bf16 MFU | 62119 tok/s +step 439/19560 | loss 5.501456 (-2.25z)| norm 0.9232 (-0.21z)| lr 3.76e-04 | 8439.21 ms | -100.0% bf16 MFU | 62120 tok/s +step 440/19560 | loss 5.559837 (-1.72z)| norm 1.1087 (+0.93z)| lr 3.77e-04 | 8435.62 ms | -100.0% bf16 MFU | 62121 tok/s +step 441/19560 | loss 5.570014 (-1.60z)| norm 0.9779 (+0.14z)| lr 3.78e-04 | 8436.28 ms | -100.0% bf16 MFU | 62122 tok/s +step 442/19560 | loss 5.566133 (-1.60z)| norm 1.0909 (+0.85z)| lr 3.79e-04 | 8432.07 ms | -100.0% bf16 MFU | 62125 tok/s +step 443/19560 | loss 5.603548 (-1.28z)| norm 0.8415 (-0.70z)| lr 3.80e-04 | 8429.02 ms | -100.0% bf16 MFU | 62129 tok/s +step 444/19560 | loss 5.519835 (-1.95z)| norm 0.7801 (-1.07z)| lr 3.81e-04 | 8431.96 ms | -100.0% bf16 MFU | 62131 tok/s +step 445/19560 | loss 5.537646 (-1.77z)| norm 0.7166 (-1.45z)| lr 3.81e-04 | 8431.95 ms | -100.0% bf16 MFU | 62134 tok/s +step 446/19560 | loss 5.538696 (-1.73z)| norm 0.8248 (-0.77z)| lr 3.82e-04 | 8428.83 ms | -100.0% bf16 MFU | 62137 tok/s +step 447/19560 | loss 5.502294 (-2.00z)| norm 0.7138 (-1.43z)| lr 3.83e-04 | 8432.13 ms | -100.0% bf16 MFU | 62139 tok/s +step 448/19560 | loss 5.496027 (-2.01z)| norm 0.6749 (-1.65z)| lr 3.84e-04 | 8428.38 ms | -100.0% bf16 MFU | 62142 tok/s +step 449/19560 | loss 5.508558 (-1.87z)| norm 0.9812 (+0.23z)| lr 3.85e-04 | 8432.51 ms | -100.0% bf16 MFU | 62144 tok/s +step 450/19560 | loss 5.502926 (-1.89z)| norm 1.0761 (+0.81z)| lr 3.86e-04 | 8431.08 ms | -100.0% bf16 MFU | 62146 tok/s +step 451/19560 | loss 5.535034 (-1.60z)| norm 0.9598 (+0.07z)| lr 3.87e-04 | 8431.31 ms | -100.0% bf16 MFU | 62148 tok/s +step 452/19560 | loss 5.498568 (-1.87z)| norm 1.1348 (+1.15z)| lr 3.87e-04 | 8431.69 ms | -100.0% bf16 MFU | 62150 tok/s +step 453/19560 | loss 5.509981 (-1.75z)| norm 1.0567 (+0.65z)| lr 3.88e-04 | 8433.69 ms | -100.0% bf16 MFU | 62150 tok/s +step 454/19560 | loss 5.554773 (-1.36z)| norm 1.0805 (+0.79z)| lr 3.89e-04 | 8432.56 ms | -100.0% bf16 MFU | 62152 tok/s +step 455/19560 | loss 5.538348 (-1.47z)| norm 0.7694 (-1.16z)| lr 3.90e-04 | 8435.34 ms | -100.0% bf16 MFU | 62152 tok/s +step 456/19560 | loss 5.547231 (-1.37z)| norm 0.7989 (-0.96z)| lr 3.91e-04 | 8433.91 ms | -100.0% bf16 MFU | 62152 tok/s +step 457/19560 | loss 5.493047 (-1.79z)| norm 0.8185 (-0.83z)| lr 3.92e-04 | 8434.71 ms | -100.0% bf16 MFU | 62153 tok/s +step 458/19560 | loss 5.473147 (-1.91z)| norm 0.8756 (-0.47z)| lr 3.93e-04 | 8432.62 ms | -100.0% bf16 MFU | 62154 tok/s +step 459/19560 | loss 5.474680 (-1.86z)| norm 0.7749 (-1.12z)| lr 3.93e-04 | 8433.99 ms | -100.0% bf16 MFU | 62154 tok/s +step 460/19560 | loss 5.437311 (-2.11z)| norm 0.7613 (-1.19z)| lr 3.94e-04 | 8433.86 ms | -100.0% bf16 MFU | 62155 tok/s +step 461/19560 | loss 5.499324 (-1.59z)| norm 0.8780 (-0.43z)| lr 3.95e-04 | 8435.35 ms | -100.0% bf16 MFU | 62155 tok/s +step 462/19560 | loss 5.523953 (-1.37z)| norm 0.9755 (+0.20z)| lr 3.96e-04 | 8435.93 ms | -100.0% bf16 MFU | 62154 tok/s +step 463/19560 | loss 5.456686 (-1.89z)| norm 0.8664 (-0.49z)| lr 3.97e-04 | 8432.91 ms | -100.0% bf16 MFU | 62155 tok/s +step 464/19560 | loss 5.452829 (-1.91z)| norm 0.7907 (-0.97z)| lr 3.98e-04 | 8436.22 ms | -100.0% bf16 MFU | 62155 tok/s +step 465/19560 | loss 5.563652 (-0.99z)| norm 0.9069 (-0.21z)| lr 3.99e-04 | 8437.16 ms | -100.0% bf16 MFU | 62154 tok/s +step 466/19560 | loss 5.513153 (-1.38z)| norm 1.2144 (+1.76z)| lr 3.99e-04 | 8436.04 ms | -100.0% bf16 MFU | 62154 tok/s +step 467/19560 | loss 5.555882 (-1.01z)| norm 1.0469 (+0.66z)| lr 4.00e-04 | 8435.40 ms | -100.0% bf16 MFU | 62154 tok/s +step 468/19560 | loss 5.515872 (-1.32z)| norm 0.9492 (+0.01z)| lr 4.01e-04 | 8436.85 ms | -100.0% bf16 MFU | 62153 tok/s +step 469/19560 | loss 5.473644 (-1.65z)| norm 1.0089 (+0.40z)| lr 4.02e-04 | 8436.39 ms | -100.0% bf16 MFU | 62153 tok/s +step 470/19560 | loss 5.459952 (-1.73z)| norm 0.9584 (+0.09z)| lr 4.03e-04 | 8436.07 ms | -100.0% bf16 MFU | 62153 tok/s +step 471/19560 | loss 5.463748 (-1.67z)| norm 0.9534 (+0.05z)| lr 4.04e-04 | 8437.41 ms | -100.0% bf16 MFU | 62152 tok/s +step 472/19560 | loss 5.492648 (-1.41z)| norm 1.0222 (+0.52z)| lr 4.05e-04 | 8435.80 ms | -100.0% bf16 MFU | 62152 tok/s +step 473/19560 | loss 5.439860 (-1.81z)| norm 1.0997 (+1.04z)| lr 4.05e-04 | 8437.73 ms | -100.0% bf16 MFU | 62151 tok/s +step 474/19560 | loss 5.487970 (-1.40z)| norm 0.8233 (-0.83z)| lr 4.06e-04 | 8437.33 ms | -100.0% bf16 MFU | 62151 tok/s +step 475/19560 | loss 5.489732 (-1.36z)| norm 0.7850 (-1.10z)| lr 4.07e-04 | 8437.31 ms | -100.0% bf16 MFU | 62150 tok/s +step 476/19560 | loss 5.470841 (-1.50z)| norm 0.8988 (-0.31z)| lr 4.08e-04 | 8436.69 ms | -100.0% bf16 MFU | 62150 tok/s +step 477/19560 | loss 5.453074 (-1.62z)| norm 0.9534 (+0.05z)| lr 4.09e-04 | 8436.02 ms | -100.0% bf16 MFU | 62150 tok/s +step 478/19560 | loss 5.454528 (-1.58z)| norm 1.0075 (+0.43z)| lr 4.10e-04 | 8438.12 ms | -100.0% bf16 MFU | 62149 tok/s +step 479/19560 | loss 5.425702 (-1.79z)| norm 0.9227 (-0.17z)| lr 4.11e-04 | 8437.45 ms | -100.0% bf16 MFU | 62148 tok/s +step 480/19560 | loss 5.447329 (-1.59z)| norm 0.8778 (-0.48z)| lr 4.11e-04 | 8436.66 ms | -100.0% bf16 MFU | 62148 tok/s +step 481/19560 | loss 5.494299 (-1.18z)| norm 0.9052 (-0.29z)| lr 4.12e-04 | 8436.96 ms | -100.0% bf16 MFU | 62148 tok/s +step 482/19560 | loss 5.447534 (-1.55z)| norm 0.8872 (-0.41z)| lr 4.13e-04 | 8435.22 ms | -100.0% bf16 MFU | 62148 tok/s +step 483/19560 | loss 5.474232 (-1.31z)| norm 0.7938 (-1.05z)| lr 4.14e-04 | 8438.93 ms | -100.0% bf16 MFU | 62147 tok/s +step 484/19560 | loss 5.606418 (-0.20z)| norm 0.9465 (+0.03z)| lr 4.15e-04 | 8441.96 ms | -100.0% bf16 MFU | 62145 tok/s +step 485/19560 | loss 5.407750 (-1.84z)| norm 1.0711 (+0.89z)| lr 4.16e-04 | 8437.52 ms | -100.0% bf16 MFU | 62145 tok/s +step 486/19560 | loss 5.432180 (-1.60z)| norm 0.9735 (+0.22z)| lr 4.17e-04 | 8436.77 ms | -100.0% bf16 MFU | 62144 tok/s +step 487/19560 | loss 5.399565 (-1.85z)| norm 1.0970 (+1.09z)| lr 4.17e-04 | 8441.87 ms | -100.0% bf16 MFU | 62143 tok/s +step 488/19560 | loss 5.428481 (-1.58z)| norm 1.1402 (+1.39z)| lr 4.18e-04 | 8461.79 ms | -100.0% bf16 MFU | 62133 tok/s +step 489/19560 | loss 5.327775 (-2.37z)| norm 1.0471 (+0.76z)| lr 4.19e-04 | 8468.75 ms | -100.0% bf16 MFU | 62122 tok/s +step 490/19560 | loss 5.382339 (-1.89z)| norm 1.3321 (+2.74z)| lr 4.20e-04 | 8522.57 ms | -100.0% bf16 MFU | 62092 tok/s +step 491/19560 | loss 5.418093 (-1.57z)| norm 0.9503 (+0.05z)| lr 4.21e-04 | 8503.92 ms | -100.0% bf16 MFU | 62070 tok/s +step 492/19560 | loss 5.383137 (-1.82z)| norm 0.9908 (+0.36z)| lr 4.22e-04 | 8461.28 ms | -100.0% bf16 MFU | 62065 tok/s +step 493/19560 | loss 5.380856 (-1.81z)| norm 0.8196 (-0.88z)| lr 4.23e-04 | 8465.49 ms | -100.0% bf16 MFU | 62058 tok/s +step 494/19560 | loss 5.350261 (-2.02z)| norm 0.7622 (-1.29z)| lr 4.23e-04 | 8461.01 ms | -100.0% bf16 MFU | 62053 tok/s +step 495/19560 | loss 5.328555 (-2.16z)| norm 0.6739 (-1.90z)| lr 4.24e-04 | 8467.81 ms | -100.0% bf16 MFU | 62046 tok/s +step 496/19560 | loss 5.387750 (-1.66z)| norm 0.8349 (-0.74z)| lr 4.25e-04 | 8462.05 ms | -100.0% bf16 MFU | 62042 tok/s +step 497/19560 | loss 5.340254 (-2.00z)| norm 0.9590 (+0.14z)| lr 4.26e-04 | 8459.79 ms | -100.0% bf16 MFU | 62039 tok/s +step 498/19560 | loss 5.429480 (-1.26z)| norm 1.0758 (+0.96z)| lr 4.27e-04 | 8463.23 ms | -100.0% bf16 MFU | 62034 tok/s +step 499/19560 | loss 5.357549 (-1.80z)| norm 0.7976 (-1.02z)| lr 4.28e-04 | 8456.58 ms | -100.0% bf16 MFU | 62032 tok/s +step 500/19560 | loss 5.393263 (-1.49z)| norm 0.7756 (-1.16z)| lr 4.29e-04 | 8456.82 ms | -100.0% bf16 MFU | 62030 tok/s +val loss 5.420555 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2452/10042 = 0.244174 +step 501/19560 | loss 5.408200 (-1.35z)| norm 0.7283 (-1.47z)| lr 4.29e-04 | 8458.77 ms | -100.0% bf16 MFU | 62028 tok/s +step 502/19560 | loss 5.385075 (-1.51z)| norm 0.8401 (-0.68z)| lr 4.30e-04 | 8462.23 ms | -100.0% bf16 MFU | 62024 tok/s +step 503/19560 | loss 5.358516 (-1.70z)| norm 0.9622 (+0.18z)| lr 4.31e-04 | 8464.03 ms | -100.0% bf16 MFU | 62020 tok/s +step 504/19560 | loss 5.426615 (-1.13z)| norm 0.7341 (-1.42z)| lr 4.32e-04 | 8455.77 ms | -100.0% bf16 MFU | 62020 tok/s +step 505/19560 | loss 5.321928 (-1.94z)| norm 0.7952 (-0.99z)| lr 4.33e-04 | 8460.56 ms | -100.0% bf16 MFU | 62017 tok/s +step 506/19560 | loss 5.386995 (-1.40z)| norm 0.8380 (-0.69z)| lr 4.34e-04 | 8450.00 ms | -100.0% bf16 MFU | 62018 tok/s +step 507/19560 | loss 5.326671 (-1.87z)| norm 0.6961 (-1.66z)| lr 4.35e-04 | 8463.86 ms | -100.0% bf16 MFU | 62015 tok/s +step 508/19560 | loss 5.332357 (-1.80z)| norm 0.7491 (-1.30z)| lr 4.35e-04 | 8457.16 ms | -100.0% bf16 MFU | 62014 tok/s +step 509/19560 | loss 5.289356 (-2.10z)| norm 0.7157 (-1.51z)| lr 4.36e-04 | 8459.10 ms | -100.0% bf16 MFU | 62012 tok/s +step 510/19560 | loss 5.236068 (-2.47z)| norm 0.7853 (-1.00z)| lr 4.37e-04 | 8457.72 ms | -100.0% bf16 MFU | 62011 tok/s +step 511/19560 | loss 5.312719 (-1.83z)| norm 1.0241 (+0.71z)| lr 4.38e-04 | 8454.40 ms | -100.0% bf16 MFU | 62011 tok/s +step 512/19560 | loss 5.366766 (-1.38z)| norm 0.9542 (+0.21z)| lr 4.39e-04 | 8455.60 ms | -100.0% bf16 MFU | 62011 tok/s +step 513/19560 | loss 5.373008 (-1.31z)| norm 1.0104 (+0.61z)| lr 4.40e-04 | 8456.13 ms | -100.0% bf16 MFU | 62010 tok/s +step 514/19560 | loss 5.376373 (-1.27z)| norm 1.1052 (+1.27z)| lr 4.41e-04 | 8459.56 ms | -100.0% bf16 MFU | 62008 tok/s +step 515/19560 | loss 5.355858 (-1.41z)| norm 1.0044 (+0.55z)| lr 4.41e-04 | 8454.15 ms | -100.0% bf16 MFU | 62009 tok/s +step 516/19560 | loss 5.329982 (-1.60z)| norm 0.9017 (-0.18z)| lr 4.42e-04 | 8462.07 ms | -100.0% bf16 MFU | 62006 tok/s +step 517/19560 | loss 5.333099 (-1.55z)| norm 0.8369 (-0.64z)| lr 4.43e-04 | 8455.70 ms | -100.0% bf16 MFU | 62006 tok/s +step 518/19560 | loss 5.259181 (-2.11z)| norm 0.7680 (-1.13z)| lr 4.44e-04 | 8458.62 ms | -100.0% bf16 MFU | 62005 tok/s +step 519/19560 | loss 5.239765 (-2.21z)| norm 0.7340 (-1.37z)| lr 4.45e-04 | 8452.10 ms | -100.0% bf16 MFU | 62006 tok/s +step 520/19560 | loss 5.327818 (-1.49z)| norm 0.6547 (-1.93z)| lr 4.46e-04 | 8450.79 ms | -100.0% bf16 MFU | 62008 tok/s +step 521/19560 | loss 5.259218 (-1.99z)| norm 0.7485 (-1.23z)| lr 4.47e-04 | 8447.44 ms | -100.0% bf16 MFU | 62011 tok/s +step 522/19560 | loss 5.276063 (-1.83z)| norm 0.9669 (+0.38z)| lr 4.47e-04 | 8456.55 ms | -100.0% bf16 MFU | 62010 tok/s +step 523/19560 | loss 5.377865 (-1.01z)| norm 1.1930 (+2.00z)| lr 4.48e-04 | 8455.84 ms | -100.0% bf16 MFU | 62010 tok/s +step 524/19560 | loss 5.303354 (-1.57z)| norm 0.8250 (-0.66z)| lr 4.49e-04 | 8450.50 ms | -100.0% bf16 MFU | 62011 tok/s +step 525/19560 | loss 5.278002 (-1.75z)| norm 0.8149 (-0.72z)| lr 4.50e-04 | 8449.14 ms | -100.0% bf16 MFU | 62013 tok/s +step 526/19560 | loss 5.259005 (-1.88z)| norm 0.8517 (-0.46z)| lr 4.51e-04 | 8446.69 ms | -100.0% bf16 MFU | 62016 tok/s +step 527/19560 | loss 5.276304 (-1.72z)| norm 0.8584 (-0.42z)| lr 4.52e-04 | 8451.81 ms | -100.0% bf16 MFU | 62017 tok/s +step 528/19560 | loss 5.296983 (-1.53z)| norm 0.7062 (-1.51z)| lr 4.53e-04 | 8459.06 ms | -100.0% bf16 MFU | 62015 tok/s +step 529/19560 | loss 5.255675 (-1.83z)| norm 0.8178 (-0.70z)| lr 4.53e-04 | 8447.96 ms | -100.0% bf16 MFU | 62017 tok/s +step 530/19560 | loss 5.264084 (-1.73z)| norm 0.9147 (+0.00z)| lr 4.54e-04 | 8451.84 ms | -100.0% bf16 MFU | 62018 tok/s +step 531/19560 | loss 5.198147 (-2.20z)| norm 0.8449 (-0.50z)| lr 4.55e-04 | 8456.81 ms | -100.0% bf16 MFU | 62017 tok/s +step 532/19560 | loss 5.237253 (-1.86z)| norm 0.7656 (-1.06z)| lr 4.56e-04 | 8443.30 ms | -100.0% bf16 MFU | 62021 tok/s +step 533/19560 | loss 5.277564 (-1.53z)| norm 0.7970 (-0.82z)| lr 4.57e-04 | 8455.52 ms | -100.0% bf16 MFU | 62020 tok/s +step 534/19560 | loss 5.234750 (-1.83z)| norm 0.7789 (-0.94z)| lr 4.58e-04 | 8451.02 ms | -100.0% bf16 MFU | 62021 tok/s +step 535/19560 | loss 5.247575 (-1.70z)| norm 0.7461 (-1.18z)| lr 4.59e-04 | 8456.03 ms | -100.0% bf16 MFU | 62020 tok/s +step 536/19560 | loss 5.227197 (-1.83z)| norm 0.6423 (-1.89z)| lr 4.59e-04 | 8449.90 ms | -100.0% bf16 MFU | 62021 tok/s +step 537/19560 | loss 5.288664 (-1.33z)| norm 0.5864 (-2.23z)| lr 4.60e-04 | 8447.71 ms | -100.0% bf16 MFU | 62024 tok/s +step 538/19560 | loss 5.272198 (-1.44z)| norm 0.5881 (-2.16z)| lr 4.61e-04 | 8450.78 ms | -100.0% bf16 MFU | 62024 tok/s +step 539/19560 | loss 5.205219 (-1.94z)| norm 0.6975 (-1.38z)| lr 4.62e-04 | 8450.62 ms | -100.0% bf16 MFU | 62025 tok/s +step 540/19560 | loss 5.263317 (-1.46z)| norm 1.0155 (+0.79z)| lr 4.63e-04 | 8447.50 ms | -100.0% bf16 MFU | 62027 tok/s +step 541/19560 | loss 5.271664 (-1.37z)| norm 0.9705 (+0.48z)| lr 4.64e-04 | 8448.56 ms | -100.0% bf16 MFU | 62029 tok/s +step 542/19560 | loss 5.285592 (-1.25z)| norm 0.8910 (-0.08z)| lr 4.65e-04 | 8447.36 ms | -100.0% bf16 MFU | 62030 tok/s +step 543/19560 | loss 5.178344 (-2.07z)| norm 0.8055 (-0.67z)| lr 4.65e-04 | 8451.78 ms | -100.0% bf16 MFU | 62031 tok/s +step 544/19560 | loss 5.241323 (-1.54z)| norm 0.7786 (-0.86z)| lr 4.66e-04 | 8453.19 ms | -100.0% bf16 MFU | 62030 tok/s +step 545/19560 | loss 5.253315 (-1.42z)| norm 0.6879 (-1.46z)| lr 4.67e-04 | 8456.64 ms | -100.0% bf16 MFU | 62029 tok/s +step 546/19560 | loss 5.189572 (-1.89z)| norm 0.6882 (-1.44z)| lr 4.68e-04 | 8446.31 ms | -100.0% bf16 MFU | 62031 tok/s +step 547/19560 | loss 5.158064 (-2.09z)| norm 0.7144 (-1.25z)| lr 4.69e-04 | 8445.93 ms | -100.0% bf16 MFU | 62033 tok/s +step 548/19560 | loss 5.238061 (-1.44z)| norm 0.8107 (-0.58z)| lr 4.70e-04 | 8444.91 ms | -100.0% bf16 MFU | 62036 tok/s +step 549/19560 | loss 5.229532 (-1.49z)| norm 0.9600 (+0.43z)| lr 4.71e-04 | 8450.72 ms | -100.0% bf16 MFU | 62036 tok/s +step 550/19560 | loss 5.187750 (-1.78z)| norm 1.0526 (+1.04z)| lr 4.71e-04 | 8446.20 ms | -100.0% bf16 MFU | 62038 tok/s +step 551/19560 | loss 5.155171 (-1.99z)| norm 0.9675 (+0.48z)| lr 4.72e-04 | 8451.92 ms | -100.0% bf16 MFU | 62037 tok/s +step 552/19560 | loss 5.232444 (-1.37z)| norm 0.7697 (-0.86z)| lr 4.73e-04 | 8447.27 ms | -100.0% bf16 MFU | 62039 tok/s +step 553/19560 | loss 5.284957 (-0.95z)| norm 0.8079 (-0.58z)| lr 4.74e-04 | 8449.40 ms | -100.0% bf16 MFU | 62039 tok/s +step 554/19560 | loss 5.201277 (-1.58z)| norm 0.7731 (-0.82z)| lr 4.75e-04 | 8443.91 ms | -100.0% bf16 MFU | 62042 tok/s +step 555/19560 | loss 5.136073 (-2.05z)| norm 0.8922 (+0.02z)| lr 4.76e-04 | 8445.03 ms | -100.0% bf16 MFU | 62044 tok/s +step 556/19560 | loss 5.242405 (-1.21z)| norm 0.6565 (-1.60z)| lr 4.77e-04 | 8444.83 ms | -100.0% bf16 MFU | 62046 tok/s +step 557/19560 | loss 5.250813 (-1.13z)| norm 0.7941 (-0.63z)| lr 4.77e-04 | 8441.55 ms | -100.0% bf16 MFU | 62049 tok/s +step 558/19560 | loss 5.251075 (-1.11z)| norm 1.1552 (+1.89z)| lr 4.78e-04 | 8445.26 ms | -100.0% bf16 MFU | 62051 tok/s +step 559/19560 | loss 5.282775 (-0.85z)| norm 1.0241 (+0.97z)| lr 4.79e-04 | 8444.69 ms | -100.0% bf16 MFU | 62052 tok/s +step 560/19560 | loss 5.196030 (-1.52z)| norm 0.9368 (+0.35z)| lr 4.80e-04 | 8441.20 ms | -100.0% bf16 MFU | 62055 tok/s +step 561/19560 | loss 5.261323 (-0.98z)| norm 0.9331 (+0.33z)| lr 4.81e-04 | 8443.60 ms | -100.0% bf16 MFU | 62057 tok/s +step 562/19560 | loss 5.279804 (-0.82z)| norm 0.8796 (-0.05z)| lr 4.82e-04 | 8447.17 ms | -100.0% bf16 MFU | 62058 tok/s +step 563/19560 | loss 5.213586 (-1.34z)| norm 0.8286 (-0.41z)| lr 4.83e-04 | 8450.14 ms | -100.0% bf16 MFU | 62057 tok/s +step 564/19560 | loss 5.157001 (-1.77z)| norm 0.6628 (-1.54z)| lr 4.83e-04 | 8441.35 ms | -100.0% bf16 MFU | 62060 tok/s +step 565/19560 | loss 5.227589 (-1.18z)| norm 0.6613 (-1.53z)| lr 4.84e-04 | 8446.51 ms | -100.0% bf16 MFU | 62060 tok/s +step 566/19560 | loss 5.131271 (-1.95z)| norm 0.6584 (-1.52z)| lr 4.85e-04 | 8443.14 ms | -100.0% bf16 MFU | 62062 tok/s +step 567/19560 | loss 5.150237 (-1.76z)| norm 0.7577 (-0.82z)| lr 4.86e-04 | 8446.96 ms | -100.0% bf16 MFU | 62062 tok/s +step 568/19560 | loss 5.208243 (-1.27z)| norm 0.8099 (-0.45z)| lr 4.87e-04 | 8442.48 ms | -100.0% bf16 MFU | 62064 tok/s +step 569/19560 | loss 5.148769 (-1.73z)| norm 0.9631 (+0.63z)| lr 4.88e-04 | 8443.15 ms | -100.0% bf16 MFU | 62066 tok/s +step 570/19560 | loss 5.184544 (-1.41z)| norm 0.9304 (+0.41z)| lr 4.89e-04 | 8442.85 ms | -100.0% bf16 MFU | 62068 tok/s +step 571/19560 | loss 5.250710 (-0.86z)| norm 0.8071 (-0.46z)| lr 4.89e-04 | 8448.71 ms | -100.0% bf16 MFU | 62067 tok/s +step 572/19560 | loss 5.154082 (-1.64z)| norm 0.8712 (-0.01z)| lr 4.90e-04 | 8446.68 ms | -100.0% bf16 MFU | 62067 tok/s +step 573/19560 | loss 5.102655 (-2.02z)| norm 0.7648 (-0.77z)| lr 4.91e-04 | 8448.24 ms | -100.0% bf16 MFU | 62067 tok/s +step 574/19560 | loss 5.157141 (-1.55z)| norm 0.7814 (-0.65z)| lr 4.92e-04 | 8444.70 ms | -100.0% bf16 MFU | 62068 tok/s +step 575/19560 | loss 5.136922 (-1.68z)| norm 0.6756 (-1.39z)| lr 4.93e-04 | 8443.85 ms | -100.0% bf16 MFU | 62069 tok/s +step 576/19560 | loss 5.143162 (-1.60z)| norm 0.7546 (-0.84z)| lr 4.94e-04 | 8442.85 ms | -100.0% bf16 MFU | 62070 tok/s +step 577/19560 | loss 5.129506 (-1.69z)| norm 0.7161 (-1.10z)| lr 4.95e-04 | 8447.35 ms | -100.0% bf16 MFU | 62070 tok/s +step 578/19560 | loss 5.191851 (-1.16z)| norm 0.6364 (-1.63z)| lr 4.95e-04 | 8442.65 ms | -100.0% bf16 MFU | 62071 tok/s +step 579/19560 | loss 5.088865 (-1.97z)| norm 0.5891 (-1.92z)| lr 4.96e-04 | 8440.99 ms | -100.0% bf16 MFU | 62074 tok/s +step 580/19560 | loss 5.087960 (-1.93z)| norm 0.6839 (-1.25z)| lr 4.97e-04 | 8442.80 ms | -100.0% bf16 MFU | 62075 tok/s +step 581/19560 | loss 5.140316 (-1.48z)| norm 0.8237 (-0.25z)| lr 4.98e-04 | 8440.74 ms | -100.0% bf16 MFU | 62077 tok/s +step 582/19560 | loss 5.109284 (-1.71z)| norm 1.0995 (+1.70z)| lr 4.99e-04 | 8440.68 ms | -100.0% bf16 MFU | 62079 tok/s +step 583/19560 | loss 5.191447 (-1.03z)| norm 0.9101 (+0.35z)| lr 5.00e-04 | 8442.85 ms | -100.0% bf16 MFU | 62080 tok/s +step 584/19560 | loss 5.162878 (-1.25z)| norm 0.8609 (+0.00z)| lr 5.01e-04 | 8442.42 ms | -100.0% bf16 MFU | 62081 tok/s +step 585/19560 | loss 5.108222 (-1.67z)| norm 0.8777 (+0.12z)| lr 5.01e-04 | 8438.40 ms | -100.0% bf16 MFU | 62083 tok/s +step 586/19560 | loss 5.180737 (-1.06z)| norm 0.9935 (+0.93z)| lr 5.02e-04 | 8445.22 ms | -100.0% bf16 MFU | 62083 tok/s +step 587/19560 | loss 5.131328 (-1.44z)| norm 0.8451 (-0.13z)| lr 5.03e-04 | 8440.39 ms | -100.0% bf16 MFU | 62085 tok/s +step 588/19560 | loss 5.125841 (-1.46z)| norm 0.7757 (-0.62z)| lr 5.04e-04 | 8437.84 ms | -100.0% bf16 MFU | 62087 tok/s +step 589/19560 | loss 5.135655 (-1.36z)| norm 0.7718 (-0.64z)| lr 5.05e-04 | 8441.81 ms | -100.0% bf16 MFU | 62088 tok/s +step 590/19560 | loss 5.127503 (-1.41z)| norm 0.7861 (-0.53z)| lr 5.06e-04 | 8441.03 ms | -100.0% bf16 MFU | 62089 tok/s +step 591/19560 | loss 5.083761 (-1.74z)| norm 0.8091 (-0.36z)| lr 5.07e-04 | 8441.67 ms | -100.0% bf16 MFU | 62090 tok/s +step 592/19560 | loss 5.123464 (-1.39z)| norm 0.7430 (-0.82z)| lr 5.07e-04 | 8443.96 ms | -100.0% bf16 MFU | 62090 tok/s +step 593/19560 | loss 5.097796 (-1.59z)| norm 0.6284 (-1.60z)| lr 5.08e-04 | 8443.05 ms | -100.0% bf16 MFU | 62091 tok/s +step 594/19560 | loss 5.107263 (-1.49z)| norm 0.5689 (-2.00z)| lr 5.09e-04 | 8445.12 ms | -100.0% bf16 MFU | 62090 tok/s +step 595/19560 | loss 5.085435 (-1.66z)| norm 0.5742 (-1.92z)| lr 5.10e-04 | 8445.73 ms | -100.0% bf16 MFU | 62090 tok/s +step 596/19560 | loss 5.010324 (-2.25z)| norm 0.5193 (-2.24z)| lr 5.11e-04 | 8439.85 ms | -100.0% bf16 MFU | 62091 tok/s +step 597/19560 | loss 5.064076 (-1.77z)| norm 0.6136 (-1.57z)| lr 5.12e-04 | 8445.61 ms | -100.0% bf16 MFU | 62090 tok/s +step 598/19560 | loss 5.072531 (-1.67z)| norm 0.7026 (-0.94z)| lr 5.13e-04 | 8442.56 ms | -100.0% bf16 MFU | 62091 tok/s +step 599/19560 | loss 5.059023 (-1.75z)| norm 0.7383 (-0.69z)| lr 5.13e-04 | 8445.31 ms | -100.0% bf16 MFU | 62090 tok/s +step 600/19560 | loss 5.090487 (-1.47z)| norm 0.8127 (-0.17z)| lr 5.14e-04 | 8442.68 ms | -100.0% bf16 MFU | 62091 tok/s +step 601/19560 | loss 5.076506 (-1.56z)| norm 0.9203 (+0.59z)| lr 5.15e-04 | 8444.52 ms | -100.0% bf16 MFU | 62091 tok/s +step 602/19560 | loss 5.089846 (-1.43z)| norm 0.9172 (+0.56z)| lr 5.16e-04 | 8444.17 ms | -100.0% bf16 MFU | 62091 tok/s +step 603/19560 | loss 5.070911 (-1.58z)| norm 0.7687 (-0.47z)| lr 5.17e-04 | 8438.78 ms | -100.0% bf16 MFU | 62092 tok/s +step 604/19560 | loss 5.065165 (-1.60z)| norm 0.8487 (+0.09z)| lr 5.18e-04 | 8442.62 ms | -100.0% bf16 MFU | 62093 tok/s +step 605/19560 | loss 5.065269 (-1.58z)| norm 0.8007 (-0.24z)| lr 5.19e-04 | 8441.90 ms | -100.0% bf16 MFU | 62093 tok/s +step 606/19560 | loss 5.070999 (-1.51z)| norm 0.7897 (-0.30z)| lr 5.19e-04 | 8440.97 ms | -100.0% bf16 MFU | 62094 tok/s +step 607/19560 | loss 4.994761 (-2.12z)| norm 0.8189 (-0.09z)| lr 5.20e-04 | 8439.69 ms | -100.0% bf16 MFU | 62096 tok/s +step 608/19560 | loss 5.072651 (-1.43z)| norm 0.8868 (+0.38z)| lr 5.21e-04 | 8441.84 ms | -100.0% bf16 MFU | 62096 tok/s +step 609/19560 | loss 5.072170 (-1.42z)| norm 0.7976 (-0.24z)| lr 5.22e-04 | 8444.65 ms | -100.0% bf16 MFU | 62096 tok/s +step 610/19560 | loss 5.033667 (-1.73z)| norm 0.8312 (+0.00z)| lr 5.23e-04 | 8444.48 ms | -100.0% bf16 MFU | 62095 tok/s +step 611/19560 | loss 5.015245 (-1.87z)| norm 0.9154 (+0.59z)| lr 5.24e-04 | 8440.85 ms | -100.0% bf16 MFU | 62096 tok/s +step 612/19560 | loss 5.036060 (-1.70z)| norm 0.8843 (+0.37z)| lr 5.25e-04 | 8438.18 ms | -100.0% bf16 MFU | 62098 tok/s +step 613/19560 | loss 5.006902 (-1.93z)| norm 0.8360 (+0.04z)| lr 5.25e-04 | 8446.84 ms | -100.0% bf16 MFU | 62097 tok/s +step 614/19560 | loss 5.011772 (-1.85z)| norm 0.7363 (-0.65z)| lr 5.26e-04 | 8443.77 ms | -100.0% bf16 MFU | 62096 tok/s +step 615/19560 | loss 5.083638 (-1.20z)| norm 0.6573 (-1.20z)| lr 5.27e-04 | 8439.67 ms | -100.0% bf16 MFU | 62098 tok/s +step 616/19560 | loss 5.084060 (-1.18z)| norm 0.6000 (-1.60z)| lr 5.28e-04 | 8443.63 ms | -100.0% bf16 MFU | 62097 tok/s +step 617/19560 | loss 5.035513 (-1.59z)| norm 0.6221 (-1.42z)| lr 5.29e-04 | 8442.54 ms | -100.0% bf16 MFU | 62098 tok/s +step 618/19560 | loss 5.025916 (-1.65z)| norm 0.6994 (-0.87z)| lr 5.30e-04 | 8442.76 ms | -100.0% bf16 MFU | 62098 tok/s +step 619/19560 | loss 4.966284 (-2.14z)| norm 0.6871 (-0.95z)| lr 5.31e-04 | 8438.21 ms | -100.0% bf16 MFU | 62099 tok/s +step 620/19560 | loss 5.095622 (-0.97z)| norm 0.5940 (-1.64z)| lr 5.31e-04 | 8438.51 ms | -100.0% bf16 MFU | 62101 tok/s +step 621/19560 | loss 4.951172 (-2.22z)| norm 0.5623 (-1.84z)| lr 5.32e-04 | 8442.88 ms | -100.0% bf16 MFU | 62101 tok/s +step 622/19560 | loss 5.007802 (-1.68z)| norm 0.6518 (-1.15z)| lr 5.33e-04 | 8442.43 ms | -100.0% bf16 MFU | 62101 tok/s +step 623/19560 | loss 5.018421 (-1.56z)| norm 0.7726 (-0.24z)| lr 5.34e-04 | 8443.27 ms | -100.0% bf16 MFU | 62101 tok/s +step 624/19560 | loss 5.026501 (-1.47z)| norm 0.9689 (+1.23z)| lr 5.35e-04 | 8441.35 ms | -100.0% bf16 MFU | 62101 tok/s +step 625/19560 | loss 5.046223 (-1.27z)| norm 0.8898 (+0.64z)| lr 5.36e-04 | 8443.95 ms | -100.0% bf16 MFU | 62100 tok/s +step 626/19560 | loss 5.023331 (-1.46z)| norm 0.7262 (-0.59z)| lr 5.37e-04 | 8445.76 ms | -100.0% bf16 MFU | 62099 tok/s +step 627/19560 | loss 5.035280 (-1.34z)| norm 0.8107 (+0.06z)| lr 5.37e-04 | 8437.63 ms | -100.0% bf16 MFU | 62101 tok/s +step 628/19560 | loss 5.019681 (-1.46z)| norm 0.9384 (+1.03z)| lr 5.38e-04 | 8435.68 ms | -100.0% bf16 MFU | 62104 tok/s +step 629/19560 | loss 5.060197 (-1.08z)| norm 1.0868 (+2.11z)| lr 5.39e-04 | 8440.27 ms | -100.0% bf16 MFU | 62104 tok/s +step 630/19560 | loss 5.052411 (-1.14z)| norm 0.9651 (+1.18z)| lr 5.40e-04 | 8439.87 ms | -100.0% bf16 MFU | 62105 tok/s +step 631/19560 | loss 5.036933 (-1.26z)| norm 0.8354 (+0.22z)| lr 5.41e-04 | 8441.95 ms | -100.0% bf16 MFU | 62105 tok/s +step 632/19560 | loss 5.000524 (-1.59z)| norm 0.7381 (-0.52z)| lr 5.42e-04 | 8441.38 ms | -100.0% bf16 MFU | 62105 tok/s +step 633/19560 | loss 5.018427 (-1.40z)| norm 0.6431 (-1.22z)| lr 5.43e-04 | 8440.76 ms | -100.0% bf16 MFU | 62106 tok/s +step 634/19560 | loss 5.050517 (-1.09z)| norm 0.5965 (-1.54z)| lr 5.43e-04 | 8438.79 ms | -100.0% bf16 MFU | 62107 tok/s +step 635/19560 | loss 5.025414 (-1.31z)| norm 0.6498 (-1.14z)| lr 5.44e-04 | 8441.12 ms | -100.0% bf16 MFU | 62107 tok/s +step 636/19560 | loss 4.993986 (-1.58z)| norm 0.6008 (-1.48z)| lr 5.45e-04 | 8443.25 ms | -100.0% bf16 MFU | 62107 tok/s +step 637/19560 | loss 4.976823 (-1.71z)| norm 0.5782 (-1.62z)| lr 5.46e-04 | 8440.30 ms | -100.0% bf16 MFU | 62107 tok/s +step 638/19560 | loss 5.078761 (-0.74z)| norm 0.5980 (-1.45z)| lr 5.47e-04 | 8437.39 ms | -100.0% bf16 MFU | 62109 tok/s +step 639/19560 | loss 4.985300 (-1.59z)| norm 0.6195 (-1.28z)| lr 5.48e-04 | 8443.08 ms | -100.0% bf16 MFU | 62108 tok/s +step 640/19560 | loss 5.021909 (-1.23z)| norm 0.6769 (-0.85z)| lr 5.49e-04 | 8443.80 ms | -100.0% bf16 MFU | 62107 tok/s +step 641/19560 | loss 4.980150 (-1.61z)| norm 0.6894 (-0.75z)| lr 5.49e-04 | 8439.34 ms | -100.0% bf16 MFU | 62108 tok/s +step 642/19560 | loss 4.986693 (-1.54z)| norm 0.6809 (-0.80z)| lr 5.50e-04 | 8443.02 ms | -100.0% bf16 MFU | 62108 tok/s +step 643/19560 | loss 5.029646 (-1.11z)| norm 0.6129 (-1.29z)| lr 5.51e-04 | 8438.65 ms | -100.0% bf16 MFU | 62109 tok/s +step 644/19560 | loss 4.981891 (-1.56z)| norm 0.6494 (-1.00z)| lr 5.52e-04 | 8440.88 ms | -100.0% bf16 MFU | 62109 tok/s +step 645/19560 | loss 4.980203 (-1.55z)| norm 0.7819 (-0.01z)| lr 5.53e-04 | 8439.05 ms | -100.0% bf16 MFU | 62110 tok/s +step 646/19560 | loss 4.999837 (-1.34z)| norm 0.7572 (-0.19z)| lr 5.54e-04 | 8438.38 ms | -100.0% bf16 MFU | 62111 tok/s +step 647/19560 | loss 4.956137 (-1.73z)| norm 0.7395 (-0.32z)| lr 5.55e-04 | 8439.95 ms | -100.0% bf16 MFU | 62111 tok/s +step 648/19560 | loss 4.950826 (-1.76z)| norm 0.8521 (+0.51z)| lr 5.55e-04 | 8440.47 ms | -100.0% bf16 MFU | 62111 tok/s +step 649/19560 | loss 4.964890 (-1.59z)| norm 0.8773 (+0.69z)| lr 5.56e-04 | 8438.78 ms | -100.0% bf16 MFU | 62112 tok/s +step 650/19560 | loss 4.916579 (-2.02z)| norm 0.9629 (+1.33z)| lr 5.57e-04 | 8437.06 ms | -100.0% bf16 MFU | 62114 tok/s +step 651/19560 | loss 4.968055 (-1.52z)| norm 0.9457 (+1.26z)| lr 5.58e-04 | 8439.41 ms | -100.0% bf16 MFU | 62114 tok/s +step 652/19560 | loss 4.970837 (-1.47z)| norm 0.7092 (-0.57z)| lr 5.59e-04 | 8438.67 ms | -100.0% bf16 MFU | 62115 tok/s +step 653/19560 | loss 4.991039 (-1.25z)| norm 0.6807 (-0.78z)| lr 5.60e-04 | 8438.88 ms | -100.0% bf16 MFU | 62116 tok/s +step 654/19560 | loss 4.910038 (-2.01z)| norm 0.6612 (-0.91z)| lr 5.61e-04 | 8441.28 ms | -100.0% bf16 MFU | 62115 tok/s +step 655/19560 | loss 4.962527 (-1.47z)| norm 0.6828 (-0.74z)| lr 5.61e-04 | 8439.32 ms | -100.0% bf16 MFU | 62116 tok/s +step 656/19560 | loss 4.996614 (-1.12z)| norm 0.7085 (-0.54z)| lr 5.62e-04 | 8440.48 ms | -100.0% bf16 MFU | 62116 tok/s +step 657/19560 | loss 4.940464 (-1.65z)| norm 0.6948 (-0.64z)| lr 5.63e-04 | 8438.89 ms | -100.0% bf16 MFU | 62116 tok/s +step 658/19560 | loss 4.922305 (-1.80z)| norm 0.7985 (+0.17z)| lr 5.64e-04 | 8438.67 ms | -100.0% bf16 MFU | 62117 tok/s +step 659/19560 | loss 4.987511 (-1.14z)| norm 0.8504 (+0.57z)| lr 5.65e-04 | 8441.93 ms | -100.0% bf16 MFU | 62116 tok/s +step 660/19560 | loss 4.942014 (-1.56z)| norm 0.7333 (-0.33z)| lr 5.66e-04 | 8439.78 ms | -100.0% bf16 MFU | 62117 tok/s +step 661/19560 | loss 4.938011 (-1.58z)| norm 0.6892 (-0.67z)| lr 5.67e-04 | 8441.43 ms | -100.0% bf16 MFU | 62116 tok/s +step 662/19560 | loss 4.916266 (-1.76z)| norm 0.7030 (-0.55z)| lr 5.67e-04 | 8442.64 ms | -100.0% bf16 MFU | 62115 tok/s +step 663/19560 | loss 4.902471 (-1.86z)| norm 0.6686 (-0.81z)| lr 5.68e-04 | 8439.66 ms | -100.0% bf16 MFU | 62116 tok/s +step 664/19560 | loss 4.899307 (-1.85z)| norm 0.6580 (-0.90z)| lr 5.69e-04 | 8442.16 ms | -100.0% bf16 MFU | 62115 tok/s +step 665/19560 | loss 4.877375 (-2.03z)| norm 0.7906 (+0.11z)| lr 5.70e-04 | 8440.56 ms | -100.0% bf16 MFU | 62115 tok/s +step 666/19560 | loss 4.994901 (-0.88z)| norm 0.8019 (+0.19z)| lr 5.71e-04 | 8438.82 ms | -100.0% bf16 MFU | 62116 tok/s +step 667/19560 | loss 4.901663 (-1.76z)| norm 0.7938 (+0.12z)| lr 5.72e-04 | 8438.89 ms | -100.0% bf16 MFU | 62116 tok/s +step 668/19560 | loss 4.878871 (-1.94z)| norm 0.8572 (+0.63z)| lr 5.73e-04 | 8442.10 ms | -100.0% bf16 MFU | 62116 tok/s +step 669/19560 | loss 4.957368 (-1.17z)| norm 0.9205 (+1.14z)| lr 5.73e-04 | 8438.22 ms | -100.0% bf16 MFU | 62117 tok/s +step 670/19560 | loss 4.905247 (-1.66z)| norm 0.8092 (+0.26z)| lr 5.74e-04 | 8440.42 ms | -100.0% bf16 MFU | 62117 tok/s +step 671/19560 | loss 4.950983 (-1.19z)| norm 0.6924 (-0.66z)| lr 5.75e-04 | 8440.18 ms | -100.0% bf16 MFU | 62117 tok/s +step 672/19560 | loss 4.942315 (-1.26z)| norm 0.8027 (+0.22z)| lr 5.76e-04 | 8438.19 ms | -100.0% bf16 MFU | 62118 tok/s +step 673/19560 | loss 4.903385 (-1.62z)| norm 0.8109 (+0.27z)| lr 5.77e-04 | 8443.39 ms | -100.0% bf16 MFU | 62116 tok/s +step 674/19560 | loss 4.948821 (-1.15z)| norm 0.8801 (+0.81z)| lr 5.78e-04 | 8441.16 ms | -100.0% bf16 MFU | 62116 tok/s +step 675/19560 | loss 4.903045 (-1.57z)| norm 0.8283 (+0.39z)| lr 5.79e-04 | 8438.62 ms | -100.0% bf16 MFU | 62117 tok/s +step 676/19560 | loss 4.947520 (-1.12z)| norm 0.7757 (-0.02z)| lr 5.79e-04 | 8440.54 ms | -100.0% bf16 MFU | 62117 tok/s +step 677/19560 | loss 5.003852 (-0.55z)| norm 0.6990 (-0.62z)| lr 5.80e-04 | 8440.42 ms | -100.0% bf16 MFU | 62117 tok/s +step 678/19560 | loss 4.930079 (-1.27z)| norm 0.7464 (-0.23z)| lr 5.81e-04 | 8453.32 ms | -100.0% bf16 MFU | 62112 tok/s +step 679/19560 | loss 4.913820 (-1.41z)| norm 0.7410 (-0.26z)| lr 5.82e-04 | 8465.92 ms | -100.0% bf16 MFU | 62103 tok/s +step 680/19560 | loss 4.937643 (-1.16z)| norm 0.7376 (-0.29z)| lr 5.83e-04 | 8462.72 ms | -100.0% bf16 MFU | 62095 tok/s +step 681/19560 | loss 4.880070 (-1.72z)| norm 0.6194 (-1.24z)| lr 5.84e-04 | 8461.71 ms | -100.0% bf16 MFU | 62089 tok/s +step 682/19560 | loss 4.920226 (-1.29z)| norm 0.6467 (-1.01z)| lr 5.85e-04 | 8463.82 ms | -100.0% bf16 MFU | 62081 tok/s +step 683/19560 | loss 4.919765 (-1.28z)| norm 0.6517 (-0.95z)| lr 5.85e-04 | 8460.21 ms | -100.0% bf16 MFU | 62076 tok/s +step 684/19560 | loss 4.885162 (-1.61z)| norm 0.6324 (-1.11z)| lr 5.86e-04 | 8461.32 ms | -100.0% bf16 MFU | 62070 tok/s +step 685/19560 | loss 4.869470 (-1.75z)| norm 0.6165 (-1.22z)| lr 5.87e-04 | 8461.38 ms | -100.0% bf16 MFU | 62065 tok/s +step 686/19560 | loss 4.883206 (-1.59z)| norm 0.5971 (-1.38z)| lr 5.88e-04 | 8458.84 ms | -100.0% bf16 MFU | 62061 tok/s +step 687/19560 | loss 4.892305 (-1.49z)| norm 0.7408 (-0.16z)| lr 5.89e-04 | 8465.76 ms | -100.0% bf16 MFU | 62054 tok/s +step 688/19560 | loss 4.871730 (-1.68z)| norm 0.7659 (+0.06z)| lr 5.90e-04 | 8463.76 ms | -100.0% bf16 MFU | 62049 tok/s +step 689/19560 | loss 4.911810 (-1.25z)| norm 0.7185 (-0.33z)| lr 5.91e-04 | 8457.40 ms | -100.0% bf16 MFU | 62046 tok/s +step 690/19560 | loss 4.951337 (-0.82z)| norm 0.7417 (-0.12z)| lr 5.91e-04 | 8461.00 ms | -100.0% bf16 MFU | 62042 tok/s +step 691/19560 | loss 4.844611 (-1.97z)| norm 0.7823 (+0.24z)| lr 5.92e-04 | 8457.11 ms | -100.0% bf16 MFU | 62039 tok/s +step 692/19560 | loss 4.955456 (-0.74z)| norm 0.9729 (+1.86z)| lr 5.93e-04 | 8464.82 ms | -100.0% bf16 MFU | 62034 tok/s +step 693/19560 | loss 4.860917 (-1.77z)| norm 0.9337 (+1.50z)| lr 5.94e-04 | 8455.06 ms | -100.0% bf16 MFU | 62033 tok/s +step 694/19560 | loss 4.882280 (-1.50z)| norm 0.6813 (-0.68z)| lr 5.95e-04 | 8455.05 ms | -100.0% bf16 MFU | 62032 tok/s +step 695/19560 | loss 4.886944 (-1.43z)| norm 0.6170 (-1.22z)| lr 5.96e-04 | 8459.13 ms | -100.0% bf16 MFU | 62029 tok/s +step 696/19560 | loss 4.826026 (-2.08z)| norm 0.5998 (-1.34z)| lr 5.97e-04 | 8454.48 ms | -100.0% bf16 MFU | 62028 tok/s +step 697/19560 | loss 4.880163 (-1.45z)| norm 0.6961 (-0.51z)| lr 5.97e-04 | 8456.22 ms | -100.0% bf16 MFU | 62027 tok/s +step 698/19560 | loss 4.913503 (-1.07z)| norm 0.8746 (+1.04z)| lr 5.98e-04 | 8457.76 ms | -100.0% bf16 MFU | 62025 tok/s +step 699/19560 | loss 4.863223 (-1.63z)| norm 0.7943 (+0.34z)| lr 5.99e-04 | 8457.97 ms | -100.0% bf16 MFU | 62023 tok/s +step 700/19560 | loss 4.785723 (-2.46z)| norm 0.7477 (-0.05z)| lr 6.00e-04 | 8453.54 ms | -100.0% bf16 MFU | 62023 tok/s +step 701/19560 | loss 4.910601 (-1.02z)| norm 0.8118 (+0.50z)| lr 6.00e-04 | 8455.08 ms | -100.0% bf16 MFU | 62022 tok/s +step 702/19560 | loss 4.864876 (-1.52z)| norm 0.9045 (+1.29z)| lr 6.00e-04 | 8456.70 ms | -100.0% bf16 MFU | 62021 tok/s +step 703/19560 | loss 4.847783 (-1.69z)| norm 0.6359 (-1.02z)| lr 6.00e-04 | 8460.25 ms | -100.0% bf16 MFU | 62018 tok/s +step 704/19560 | loss 4.865015 (-1.47z)| norm 0.5744 (-1.53z)| lr 6.00e-04 | 8457.07 ms | -100.0% bf16 MFU | 62017 tok/s +step 705/19560 | loss 4.820211 (-1.95z)| norm 0.5050 (-2.07z)| lr 6.00e-04 | 8452.30 ms | -100.0% bf16 MFU | 62018 tok/s +step 706/19560 | loss 4.821885 (-1.91z)| norm 0.5151 (-1.95z)| lr 6.00e-04 | 8452.07 ms | -100.0% bf16 MFU | 62018 tok/s +step 707/19560 | loss 4.866717 (-1.37z)| norm 0.5619 (-1.56z)| lr 6.00e-04 | 8453.53 ms | -100.0% bf16 MFU | 62019 tok/s +step 708/19560 | loss 4.864428 (-1.38z)| norm 0.6785 (-0.60z)| lr 6.00e-04 | 8458.39 ms | -100.0% bf16 MFU | 62017 tok/s +step 709/19560 | loss 4.814985 (-1.91z)| norm 0.8669 (+0.96z)| lr 6.00e-04 | 8452.49 ms | -100.0% bf16 MFU | 62017 tok/s +step 710/19560 | loss 4.832235 (-1.68z)| norm 0.8293 (+0.69z)| lr 6.00e-04 | 8461.03 ms | -100.0% bf16 MFU | 62015 tok/s +step 711/19560 | loss 4.967796 (-0.12z)| norm 0.7810 (+0.28z)| lr 6.00e-04 | 8457.29 ms | -100.0% bf16 MFU | 62014 tok/s +step 712/19560 | loss 4.791662 (-2.14z)| norm 0.7081 (-0.33z)| lr 6.00e-04 | 8453.98 ms | -100.0% bf16 MFU | 62014 tok/s +step 713/19560 | loss 4.826641 (-1.70z)| norm 0.7089 (-0.31z)| lr 6.00e-04 | 8449.99 ms | -100.0% bf16 MFU | 62015 tok/s +step 714/19560 | loss 4.815334 (-1.82z)| norm 0.6850 (-0.51z)| lr 6.00e-04 | 8453.27 ms | -100.0% bf16 MFU | 62016 tok/s +step 715/19560 | loss 4.853207 (-1.36z)| norm 0.7424 (+0.00z)| lr 6.00e-04 | 8451.05 ms | -100.0% bf16 MFU | 62017 tok/s +step 716/19560 | loss 4.831374 (-1.60z)| norm 0.8462 (+0.91z)| lr 6.00e-04 | 8456.21 ms | -100.0% bf16 MFU | 62016 tok/s +step 717/19560 | loss 4.817385 (-1.74z)| norm 0.9998 (+2.21z)| lr 6.00e-04 | 8451.84 ms | -100.0% bf16 MFU | 62017 tok/s +step 718/19560 | loss 4.842941 (-1.42z)| norm 0.9155 (+1.46z)| lr 6.00e-04 | 8448.21 ms | -100.0% bf16 MFU | 62019 tok/s +step 719/19560 | loss 4.779994 (-2.13z)| norm 0.8439 (+0.84z)| lr 6.00e-04 | 8452.88 ms | -100.0% bf16 MFU | 62019 tok/s +step 720/19560 | loss 4.784621 (-2.04z)| norm 0.7216 (-0.20z)| lr 6.00e-04 | 8452.88 ms | -100.0% bf16 MFU | 62020 tok/s +step 721/19560 | loss 4.787504 (-1.97z)| norm 0.5980 (-1.26z)| lr 6.00e-04 | 8452.89 ms | -100.0% bf16 MFU | 62020 tok/s +step 722/19560 | loss 4.779885 (-2.02z)| norm 0.5793 (-1.42z)| lr 6.00e-04 | 8448.37 ms | -100.0% bf16 MFU | 62022 tok/s +step 723/19560 | loss 4.831033 (-1.40z)| norm 0.6125 (-1.14z)| lr 6.00e-04 | 8449.84 ms | -100.0% bf16 MFU | 62023 tok/s +step 724/19560 | loss 4.796884 (-1.76z)| norm 0.5732 (-1.49z)| lr 6.00e-04 | 8451.12 ms | -100.0% bf16 MFU | 62024 tok/s +step 725/19560 | loss 4.828616 (-1.37z)| norm 0.6004 (-1.25z)| lr 6.00e-04 | 8455.90 ms | -100.0% bf16 MFU | 62023 tok/s +step 726/19560 | loss 4.806154 (-1.60z)| norm 0.7228 (-0.20z)| lr 6.00e-04 | 8450.45 ms | -100.0% bf16 MFU | 62024 tok/s +step 727/19560 | loss 4.752672 (-2.18z)| norm 0.7197 (-0.23z)| lr 6.00e-04 | 8453.63 ms | -100.0% bf16 MFU | 62023 tok/s +step 728/19560 | loss 4.809000 (-1.50z)| norm 0.6446 (-0.86z)| lr 6.00e-04 | 8455.97 ms | -100.0% bf16 MFU | 62022 tok/s +step 729/19560 | loss 4.793210 (-1.66z)| norm 0.6472 (-0.82z)| lr 6.00e-04 | 8458.68 ms | -100.0% bf16 MFU | 62020 tok/s +step 730/19560 | loss 4.741107 (-2.21z)| norm 0.7834 (+0.37z)| lr 6.00e-04 | 8450.92 ms | -100.0% bf16 MFU | 62021 tok/s +step 731/19560 | loss 4.816381 (-1.33z)| norm 0.7093 (-0.28z)| lr 6.00e-04 | 8449.56 ms | -100.0% bf16 MFU | 62023 tok/s +step 732/19560 | loss 4.766592 (-1.87z)| norm 0.6062 (-1.16z)| lr 6.00e-04 | 8453.34 ms | -100.0% bf16 MFU | 62023 tok/s +step 733/19560 | loss 4.765415 (-1.85z)| norm 0.7077 (-0.27z)| lr 6.00e-04 | 8450.62 ms | -100.0% bf16 MFU | 62024 tok/s +step 734/19560 | loss 4.723209 (-2.28z)| norm 0.7669 (+0.25z)| lr 6.00e-04 | 8446.87 ms | -100.0% bf16 MFU | 62026 tok/s +step 735/19560 | loss 4.747623 (-1.95z)| norm 0.6093 (-1.11z)| lr 6.00e-04 | 8453.73 ms | -100.0% bf16 MFU | 62025 tok/s +step 736/19560 | loss 4.745377 (-1.94z)| norm 0.5856 (-1.29z)| lr 6.00e-04 | 8449.25 ms | -100.0% bf16 MFU | 62027 tok/s +step 737/19560 | loss 4.756789 (-1.79z)| norm 0.5345 (-1.70z)| lr 6.00e-04 | 8450.77 ms | -100.0% bf16 MFU | 62027 tok/s +step 738/19560 | loss 4.761286 (-1.70z)| norm 0.5592 (-1.46z)| lr 6.00e-04 | 8450.08 ms | -100.0% bf16 MFU | 62028 tok/s +step 739/19560 | loss 4.682558 (-2.49z)| norm 0.5515 (-1.51z)| lr 6.00e-04 | 8447.09 ms | -100.0% bf16 MFU | 62030 tok/s +step 740/19560 | loss 4.768300 (-1.54z)| norm 0.6531 (-0.62z)| lr 6.00e-04 | 8452.06 ms | -100.0% bf16 MFU | 62030 tok/s +step 741/19560 | loss 4.732598 (-1.88z)| norm 0.7641 (+0.34z)| lr 6.00e-04 | 8451.54 ms | -100.0% bf16 MFU | 62031 tok/s +step 742/19560 | loss 4.772770 (-1.43z)| norm 0.6989 (-0.22z)| lr 6.00e-04 | 8445.08 ms | -100.0% bf16 MFU | 62033 tok/s +step 743/19560 | loss 4.805684 (-1.06z)| norm 0.6572 (-0.58z)| lr 6.00e-04 | 8456.26 ms | -100.0% bf16 MFU | 62031 tok/s +step 744/19560 | loss 4.681592 (-2.35z)| norm 0.7131 (-0.11z)| lr 6.00e-04 | 8444.37 ms | -100.0% bf16 MFU | 62034 tok/s +step 745/19560 | loss 4.680960 (-2.30z)| norm 0.6352 (-0.78z)| lr 6.00e-04 | 8446.40 ms | -100.0% bf16 MFU | 62036 tok/s +step 746/19560 | loss 4.726057 (-1.79z)| norm 0.6079 (-1.01z)| lr 6.00e-04 | 8447.09 ms | -100.0% bf16 MFU | 62038 tok/s +step 747/19560 | loss 4.710097 (-1.91z)| norm 0.8302 (+0.90z)| lr 6.00e-04 | 8442.70 ms | -100.0% bf16 MFU | 62041 tok/s +step 748/19560 | loss 4.743043 (-1.55z)| norm 0.8042 (+0.66z)| lr 6.00e-04 | 8447.03 ms | -100.0% bf16 MFU | 62042 tok/s +step 749/19560 | loss 4.712428 (-1.83z)| norm 0.6932 (-0.31z)| lr 6.00e-04 | 8444.63 ms | -100.0% bf16 MFU | 62044 tok/s +step 750/19560 | loss 4.776558 (-1.15z)| norm 0.7995 (+0.61z)| lr 6.00e-04 | 8451.30 ms | -100.0% bf16 MFU | 62044 tok/s +val loss 4.730120 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2528/10042 = 0.251743 +step 751/19560 | loss 4.738505 (-1.52z)| norm 0.8072 (+0.67z)| lr 6.00e-04 | 8443.80 ms | -100.0% bf16 MFU | 62046 tok/s +step 752/19560 | loss 4.785816 (-1.01z)| norm 0.7577 (+0.26z)| lr 6.00e-04 | 8442.29 ms | -100.0% bf16 MFU | 62049 tok/s +step 753/19560 | loss 4.740735 (-1.46z)| norm 0.6412 (-0.76z)| lr 6.00e-04 | 8446.72 ms | -100.0% bf16 MFU | 62050 tok/s +step 754/19560 | loss 4.732038 (-1.53z)| norm 0.7142 (-0.11z)| lr 6.00e-04 | 8443.31 ms | -100.0% bf16 MFU | 62052 tok/s +step 755/19560 | loss 4.737041 (-1.45z)| norm 0.6688 (-0.50z)| lr 6.00e-04 | 8448.92 ms | -100.0% bf16 MFU | 62052 tok/s +step 756/19560 | loss 4.711574 (-1.69z)| norm 0.5999 (-1.11z)| lr 6.00e-04 | 8449.28 ms | -100.0% bf16 MFU | 62052 tok/s +step 757/19560 | loss 4.695774 (-1.82z)| norm 0.6307 (-0.83z)| lr 6.00e-04 | 8443.41 ms | -100.0% bf16 MFU | 62054 tok/s +step 758/19560 | loss 4.684559 (-1.91z)| norm 0.5489 (-1.59z)| lr 6.00e-04 | 8448.40 ms | -100.0% bf16 MFU | 62055 tok/s +step 759/19560 | loss 4.694374 (-1.78z)| norm 0.5890 (-1.19z)| lr 6.00e-04 | 8442.80 ms | -100.0% bf16 MFU | 62057 tok/s +step 760/19560 | loss 4.663586 (-2.05z)| norm 0.5606 (-1.43z)| lr 6.00e-04 | 8454.63 ms | -100.0% bf16 MFU | 62055 tok/s +step 761/19560 | loss 4.646347 (-2.18z)| norm 0.5050 (-1.92z)| lr 6.00e-04 | 8445.68 ms | -100.0% bf16 MFU | 62056 tok/s +step 762/19560 | loss 4.686401 (-1.75z)| norm 0.5460 (-1.53z)| lr 6.00e-04 | 8443.64 ms | -100.0% bf16 MFU | 62058 tok/s +step 763/19560 | loss 4.733723 (-1.25z)| norm 0.5167 (-1.77z)| lr 6.00e-04 | 8440.57 ms | -100.0% bf16 MFU | 62060 tok/s +step 764/19560 | loss 4.678862 (-1.78z)| norm 0.5613 (-1.35z)| lr 6.00e-04 | 8442.39 ms | -100.0% bf16 MFU | 62063 tok/s +step 765/19560 | loss 4.664773 (-1.88z)| norm 0.6559 (-0.50z)| lr 6.00e-04 | 8444.00 ms | -100.0% bf16 MFU | 62064 tok/s +step 766/19560 | loss 4.670557 (-1.81z)| norm 0.6753 (-0.33z)| lr 6.00e-04 | 8445.88 ms | -100.0% bf16 MFU | 62065 tok/s +step 767/19560 | loss 4.696638 (-1.51z)| norm 0.6795 (-0.29z)| lr 6.00e-04 | 8449.35 ms | -100.0% bf16 MFU | 62064 tok/s +step 768/19560 | loss 4.713241 (-1.33z)| norm 0.6291 (-0.75z)| lr 6.00e-04 | 8444.54 ms | -100.0% bf16 MFU | 62065 tok/s +step 769/19560 | loss 4.656108 (-1.88z)| norm 0.5955 (-1.05z)| lr 6.00e-04 | 8446.96 ms | -100.0% bf16 MFU | 62065 tok/s +step 770/19560 | loss 4.633137 (-2.07z)| norm 0.5667 (-1.30z)| lr 6.00e-04 | 8446.04 ms | -100.0% bf16 MFU | 62066 tok/s +step 771/19560 | loss 4.589967 (-2.45z)| norm 0.5878 (-1.10z)| lr 6.00e-04 | 8445.06 ms | -100.0% bf16 MFU | 62066 tok/s +step 772/19560 | loss 4.630618 (-2.00z)| norm 0.6048 (-0.94z)| lr 6.00e-04 | 8448.16 ms | -100.0% bf16 MFU | 62066 tok/s +step 773/19560 | loss 4.629695 (-1.97z)| norm 0.5989 (-0.98z)| lr 6.00e-04 | 8445.46 ms | -100.0% bf16 MFU | 62067 tok/s +step 774/19560 | loss 4.669918 (-1.55z)| norm 0.5607 (-1.31z)| lr 6.00e-04 | 8445.71 ms | -100.0% bf16 MFU | 62067 tok/s +step 775/19560 | loss 4.619682 (-2.00z)| norm 0.5608 (-1.29z)| lr 6.00e-04 | 8444.58 ms | -100.0% bf16 MFU | 62068 tok/s +step 776/19560 | loss 4.624755 (-1.91z)| norm 0.6130 (-0.81z)| lr 6.00e-04 | 8445.97 ms | -100.0% bf16 MFU | 62069 tok/s +step 777/19560 | loss 4.578908 (-2.29z)| norm 0.6697 (-0.28z)| lr 6.00e-04 | 8441.36 ms | -100.0% bf16 MFU | 62071 tok/s +step 778/19560 | loss 4.603806 (-2.01z)| norm 0.6751 (-0.22z)| lr 6.00e-04 | 8443.35 ms | -100.0% bf16 MFU | 62072 tok/s +step 779/19560 | loss 4.609922 (-1.91z)| norm 0.6899 (-0.06z)| lr 6.00e-04 | 8439.11 ms | -100.0% bf16 MFU | 62075 tok/s +step 780/19560 | loss 4.567618 (-2.25z)| norm 0.6575 (-0.37z)| lr 6.00e-04 | 8445.23 ms | -100.0% bf16 MFU | 62075 tok/s +step 781/19560 | loss 4.647775 (-1.48z)| norm 0.6208 (-0.71z)| lr 6.00e-04 | 8443.30 ms | -100.0% bf16 MFU | 62076 tok/s +step 782/19560 | loss 4.604753 (-1.85z)| norm 0.5236 (-1.60z)| lr 6.00e-04 | 8445.51 ms | -100.0% bf16 MFU | 62076 tok/s +step 783/19560 | loss 4.628295 (-1.60z)| norm 0.5589 (-1.26z)| lr 6.00e-04 | 8445.77 ms | -100.0% bf16 MFU | 62076 tok/s +step 784/19560 | loss 4.588408 (-1.94z)| norm 0.5325 (-1.48z)| lr 6.00e-04 | 8445.92 ms | -100.0% bf16 MFU | 62076 tok/s +step 785/19560 | loss 4.604785 (-1.75z)| norm 0.6567 (-0.33z)| lr 6.00e-04 | 8445.92 ms | -100.0% bf16 MFU | 62076 tok/s +step 786/19560 | loss 4.633887 (-1.46z)| norm 0.6051 (-0.79z)| lr 6.00e-04 | 8445.85 ms | -100.0% bf16 MFU | 62076 tok/s +step 787/19560 | loss 4.616985 (-1.59z)| norm 0.6290 (-0.56z)| lr 6.00e-04 | 8445.54 ms | -100.0% bf16 MFU | 62076 tok/s +step 788/19560 | loss 4.630918 (-1.44z)| norm 0.5783 (-1.01z)| lr 6.00e-04 | 8443.84 ms | -100.0% bf16 MFU | 62077 tok/s +step 789/19560 | loss 4.641620 (-1.32z)| norm 0.4858 (-1.83z)| lr 6.00e-04 | 8443.30 ms | -100.0% bf16 MFU | 62078 tok/s +step 790/19560 | loss 4.583085 (-1.82z)| norm 0.5120 (-1.56z)| lr 6.00e-04 | 8444.25 ms | -100.0% bf16 MFU | 62078 tok/s +step 791/19560 | loss 4.558794 (-1.99z)| norm 0.5748 (-0.98z)| lr 6.00e-04 | 8445.26 ms | -100.0% bf16 MFU | 62078 tok/s +step 792/19560 | loss 4.606759 (-1.53z)| norm 0.6328 (-0.46z)| lr 6.00e-04 | 8442.49 ms | -100.0% bf16 MFU | 62080 tok/s +step 793/19560 | loss 4.618989 (-1.40z)| norm 0.6030 (-0.72z)| lr 6.00e-04 | 8438.72 ms | -100.0% bf16 MFU | 62082 tok/s +step 794/19560 | loss 4.596261 (-1.58z)| norm 0.6139 (-0.61z)| lr 6.00e-04 | 8440.28 ms | -100.0% bf16 MFU | 62084 tok/s +step 795/19560 | loss 4.609459 (-1.44z)| norm 0.6407 (-0.35z)| lr 6.00e-04 | 8437.59 ms | -100.0% bf16 MFU | 62087 tok/s +step 796/19560 | loss 4.616353 (-1.36z)| norm 0.6484 (-0.27z)| lr 6.00e-04 | 8431.46 ms | -100.0% bf16 MFU | 62091 tok/s +step 797/19560 | loss 4.604291 (-1.44z)| norm 0.6160 (-0.56z)| lr 6.00e-04 | 8436.16 ms | -100.0% bf16 MFU | 62094 tok/s +step 798/19560 | loss 4.624650 (-1.24z)| norm 0.5679 (-0.99z)| lr 6.00e-04 | 8434.49 ms | -100.0% bf16 MFU | 62097 tok/s +step 799/19560 | loss 4.596262 (-1.47z)| norm 0.5565 (-1.08z)| lr 6.00e-04 | 8436.73 ms | -100.0% bf16 MFU | 62100 tok/s +step 800/19560 | loss 4.557664 (-1.79z)| norm 0.5934 (-0.73z)| lr 6.00e-04 | 8433.74 ms | -100.0% bf16 MFU | 62103 tok/s +step 801/19560 | loss 4.582559 (-1.54z)| norm 0.5263 (-1.34z)| lr 6.00e-04 | 8433.80 ms | -100.0% bf16 MFU | 62106 tok/s +step 802/19560 | loss 4.599168 (-1.37z)| norm 0.5128 (-1.45z)| lr 6.00e-04 | 8434.67 ms | -100.0% bf16 MFU | 62109 tok/s +step 803/19560 | loss 4.586258 (-1.46z)| norm 0.6289 (-0.34z)| lr 6.00e-04 | 8434.99 ms | -100.0% bf16 MFU | 62111 tok/s +step 804/19560 | loss 4.571324 (-1.57z)| norm 0.7406 (+0.73z)| lr 6.00e-04 | 8437.79 ms | -100.0% bf16 MFU | 62112 tok/s +step 805/19560 | loss 4.606825 (-1.25z)| norm 0.7227 (+0.56z)| lr 6.00e-04 | 8439.21 ms | -100.0% bf16 MFU | 62113 tok/s +step 806/19560 | loss 4.608241 (-1.22z)| norm 0.5627 (-0.95z)| lr 6.00e-04 | 8437.75 ms | -100.0% bf16 MFU | 62114 tok/s +step 807/19560 | loss 4.537314 (-1.83z)| norm 0.4818 (-1.69z)| lr 6.00e-04 | 8435.41 ms | -100.0% bf16 MFU | 62116 tok/s +step 808/19560 | loss 4.585658 (-1.38z)| norm 0.4408 (-2.03z)| lr 6.00e-04 | 8438.20 ms | -100.0% bf16 MFU | 62117 tok/s +step 809/19560 | loss 4.557157 (-1.61z)| norm 0.4515 (-1.89z)| lr 6.00e-04 | 8431.29 ms | -100.0% bf16 MFU | 62120 tok/s +step 810/19560 | loss 4.604730 (-1.16z)| norm 0.4846 (-1.56z)| lr 6.00e-04 | 8434.83 ms | -100.0% bf16 MFU | 62122 tok/s +step 811/19560 | loss 4.552739 (-1.61z)| norm 0.6299 (-0.23z)| lr 6.00e-04 | 8438.62 ms | -100.0% bf16 MFU | 62122 tok/s +step 812/19560 | loss 4.621148 (-0.97z)| norm 0.6995 (+0.39z)| lr 6.00e-04 | 8442.25 ms | -100.0% bf16 MFU | 62122 tok/s +step 813/19560 | loss 4.535626 (-1.72z)| norm 0.6685 (+0.11z)| lr 6.00e-04 | 8439.12 ms | -100.0% bf16 MFU | 62122 tok/s +step 814/19560 | loss 4.518290 (-1.85z)| norm 0.5945 (-0.56z)| lr 6.00e-04 | 8438.98 ms | -100.0% bf16 MFU | 62122 tok/s +step 815/19560 | loss 4.574994 (-1.31z)| norm 0.5335 (-1.10z)| lr 6.00e-04 | 8439.19 ms | -100.0% bf16 MFU | 62122 tok/s +step 816/19560 | loss 4.509600 (-1.87z)| norm 0.5251 (-1.16z)| lr 6.00e-04 | 8438.65 ms | -100.0% bf16 MFU | 62123 tok/s +step 817/19560 | loss 4.503269 (-1.90z)| norm 0.5659 (-0.78z)| lr 6.00e-04 | 8442.97 ms | -100.0% bf16 MFU | 62121 tok/s +step 818/19560 | loss 4.591314 (-1.09z)| norm 0.5949 (-0.51z)| lr 6.00e-04 | 8438.93 ms | -100.0% bf16 MFU | 62122 tok/s +step 819/19560 | loss 4.558263 (-1.37z)| norm 0.6782 (+0.26z)| lr 6.00e-04 | 8437.63 ms | -100.0% bf16 MFU | 62122 tok/s +step 820/19560 | loss 4.630805 (-0.69z)| norm 0.7296 (+0.77z)| lr 6.00e-04 | 8439.93 ms | -100.0% bf16 MFU | 62122 tok/s +step 821/19560 | loss 4.507208 (-1.82z)| norm 0.7694 (+1.19z)| lr 6.00e-04 | 8442.21 ms | -100.0% bf16 MFU | 62121 tok/s +step 822/19560 | loss 4.579454 (-1.13z)| norm 0.7072 (+0.58z)| lr 6.00e-04 | 8439.11 ms | -100.0% bf16 MFU | 62121 tok/s +step 823/19560 | loss 4.545239 (-1.43z)| norm 0.6008 (-0.44z)| lr 6.00e-04 | 8438.84 ms | -100.0% bf16 MFU | 62122 tok/s +step 824/19560 | loss 4.568741 (-1.19z)| norm 0.5327 (-1.09z)| lr 6.00e-04 | 8443.42 ms | -100.0% bf16 MFU | 62120 tok/s +step 825/19560 | loss 4.674378 (-0.18z)| norm 0.4868 (-1.50z)| lr 6.00e-04 | 8438.58 ms | -100.0% bf16 MFU | 62121 tok/s +step 826/19560 | loss 4.568445 (-1.18z)| norm 0.4302 (-2.02z)| lr 6.00e-04 | 8440.59 ms | -100.0% bf16 MFU | 62121 tok/s +step 827/19560 | loss 4.525539 (-1.57z)| norm 0.4237 (-2.03z)| lr 6.00e-04 | 8440.16 ms | -100.0% bf16 MFU | 62121 tok/s +step 828/19560 | loss 4.525681 (-1.54z)| norm 0.4778 (-1.49z)| lr 6.00e-04 | 8443.21 ms | -100.0% bf16 MFU | 62119 tok/s +step 829/19560 | loss 4.514349 (-1.63z)| norm 0.5434 (-0.86z)| lr 6.00e-04 | 8438.05 ms | -100.0% bf16 MFU | 62120 tok/s +step 830/19560 | loss 4.493604 (-1.81z)| norm 0.6022 (-0.29z)| lr 6.00e-04 | 8442.77 ms | -100.0% bf16 MFU | 62119 tok/s +step 831/19560 | loss 4.502804 (-1.69z)| norm 0.5467 (-0.82z)| lr 6.00e-04 | 8440.03 ms | -100.0% bf16 MFU | 62119 tok/s +step 832/19560 | loss 4.535011 (-1.36z)| norm 0.5810 (-0.49z)| lr 6.00e-04 | 8441.05 ms | -100.0% bf16 MFU | 62119 tok/s +step 833/19560 | loss 4.560984 (-1.09z)| norm 0.6042 (-0.27z)| lr 6.00e-04 | 8438.91 ms | -100.0% bf16 MFU | 62119 tok/s +step 834/19560 | loss 4.562866 (-1.06z)| norm 0.6648 (+0.31z)| lr 6.00e-04 | 8439.88 ms | -100.0% bf16 MFU | 62119 tok/s +step 835/19560 | loss 4.467941 (-1.96z)| norm 0.6308 (-0.03z)| lr 6.00e-04 | 8439.56 ms | -100.0% bf16 MFU | 62119 tok/s +step 836/19560 | loss 4.560596 (-1.04z)| norm 0.5376 (-0.93z)| lr 6.00e-04 | 8438.92 ms | -100.0% bf16 MFU | 62120 tok/s +step 837/19560 | loss 4.630239 (-0.33z)| norm 0.5193 (-1.10z)| lr 6.00e-04 | 8440.82 ms | -100.0% bf16 MFU | 62119 tok/s +step 838/19560 | loss 4.520601 (-1.41z)| norm 0.5163 (-1.12z)| lr 6.00e-04 | 8442.97 ms | -100.0% bf16 MFU | 62118 tok/s +step 839/19560 | loss 4.499909 (-1.63z)| norm 0.5444 (-0.83z)| lr 6.00e-04 | 8438.26 ms | -100.0% bf16 MFU | 62119 tok/s +step 840/19560 | loss 4.534667 (-1.25z)| norm 0.5576 (-0.68z)| lr 6.00e-04 | 8439.59 ms | -100.0% bf16 MFU | 62119 tok/s +step 841/19560 | loss 4.557829 (-1.00z)| norm 0.4762 (-1.48z)| lr 6.00e-04 | 8440.44 ms | -100.0% bf16 MFU | 62119 tok/s +step 842/19560 | loss 4.511199 (-1.46z)| norm 0.4622 (-1.59z)| lr 6.00e-04 | 8440.21 ms | -100.0% bf16 MFU | 62119 tok/s +step 843/19560 | loss 4.514184 (-1.42z)| norm 0.4872 (-1.32z)| lr 6.00e-04 | 8438.36 ms | -100.0% bf16 MFU | 62120 tok/s +step 844/19560 | loss 4.502285 (-1.52z)| norm 0.4218 (-1.96z)| lr 6.00e-04 | 8441.50 ms | -100.0% bf16 MFU | 62119 tok/s +step 845/19560 | loss 4.483498 (-1.70z)| norm 0.4103 (-2.12z)| lr 6.00e-04 | 8440.53 ms | -100.0% bf16 MFU | 62119 tok/s +step 846/19560 | loss 4.525676 (-1.24z)| norm 0.5000 (-1.18z)| lr 6.00e-04 | 8441.93 ms | -100.0% bf16 MFU | 62118 tok/s +step 847/19560 | loss 4.478983 (-1.71z)| norm 0.5485 (-0.64z)| lr 6.00e-04 | 8443.54 ms | -100.0% bf16 MFU | 62117 tok/s +step 848/19560 | loss 4.498732 (-1.48z)| norm 0.6173 (+0.15z)| lr 6.00e-04 | 8442.92 ms | -100.0% bf16 MFU | 62116 tok/s +step 849/19560 | loss 4.489985 (-1.55z)| norm 0.6087 (+0.05z)| lr 6.00e-04 | 8439.18 ms | -100.0% bf16 MFU | 62116 tok/s +step 850/19560 | loss 4.482467 (-1.60z)| norm 0.6389 (+0.39z)| lr 6.00e-04 | 8436.52 ms | -100.0% bf16 MFU | 62118 tok/s +step 851/19560 | loss 4.452187 (-1.90z)| norm 0.6303 (+0.29z)| lr 6.00e-04 | 8434.55 ms | -100.0% bf16 MFU | 62120 tok/s +step 852/19560 | loss 4.467724 (-1.71z)| norm 0.5941 (-0.13z)| lr 6.00e-04 | 8432.36 ms | -100.0% bf16 MFU | 62123 tok/s +step 853/19560 | loss 4.559963 (-0.70z)| norm 0.6306 (+0.28z)| lr 6.00e-04 | 8431.65 ms | -100.0% bf16 MFU | 62126 tok/s +step 854/19560 | loss 4.525303 (-1.07z)| norm 0.7071 (+1.15z)| lr 6.00e-04 | 8431.86 ms | -100.0% bf16 MFU | 62128 tok/s +step 855/19560 | loss 4.528901 (-1.02z)| norm 0.6520 (+0.54z)| lr 6.00e-04 | 8435.40 ms | -100.0% bf16 MFU | 62130 tok/s +step 856/19560 | loss 4.473673 (-1.62z)| norm 0.6176 (+0.15z)| lr 6.00e-04 | 8430.81 ms | -100.0% bf16 MFU | 62132 tok/s +step 857/19560 | loss 4.466209 (-1.68z)| norm 0.5116 (-1.05z)| lr 6.00e-04 | 8433.22 ms | -100.0% bf16 MFU | 62134 tok/s +step 858/19560 | loss 4.491373 (-1.37z)| norm 0.5037 (-1.12z)| lr 6.00e-04 | 8433.37 ms | -100.0% bf16 MFU | 62136 tok/s +step 859/19560 | loss 4.477680 (-1.52z)| norm 0.4921 (-1.24z)| lr 6.00e-04 | 8433.54 ms | -100.0% bf16 MFU | 62138 tok/s +step 860/19560 | loss 4.504613 (-1.19z)| norm 0.4452 (-1.74z)| lr 6.00e-04 | 8430.72 ms | -100.0% bf16 MFU | 62140 tok/s +step 861/19560 | loss 4.393952 (-2.41z)| norm 0.4532 (-1.62z)| lr 6.00e-04 | 8430.38 ms | -100.0% bf16 MFU | 62143 tok/s +step 862/19560 | loss 4.428115 (-1.98z)| norm 0.4618 (-1.51z)| lr 6.00e-04 | 8432.48 ms | -100.0% bf16 MFU | 62144 tok/s +step 863/19560 | loss 4.493054 (-1.23z)| norm 0.5138 (-0.90z)| lr 6.00e-04 | 8436.57 ms | -100.0% bf16 MFU | 62144 tok/s +step 864/19560 | loss 4.472966 (-1.43z)| norm 0.5074 (-0.96z)| lr 6.00e-04 | 8434.79 ms | -100.0% bf16 MFU | 62145 tok/s +step 865/19560 | loss 4.493762 (-1.18z)| norm 0.5814 (-0.13z)| lr 6.00e-04 | 8434.47 ms | -100.0% bf16 MFU | 62146 tok/s +step 866/19560 | loss 4.478921 (-1.34z)| norm 0.6115 (+0.21z)| lr 6.00e-04 | 8435.24 ms | -100.0% bf16 MFU | 62146 tok/s +step 867/19560 | loss 4.469342 (-1.42z)| norm 0.5314 (-0.70z)| lr 6.00e-04 | 8433.02 ms | -100.0% bf16 MFU | 62147 tok/s +step 868/19560 | loss 4.516650 (-0.87z)| norm 0.5488 (-0.50z)| lr 6.00e-04 | 8438.21 ms | -100.0% bf16 MFU | 62147 tok/s +step 869/19560 | loss 4.478722 (-1.29z)| norm 0.6661 (+0.85z)| lr 6.00e-04 | 8456.42 ms | -100.0% bf16 MFU | 62139 tok/s +step 870/19560 | loss 4.417573 (-1.98z)| norm 0.6524 (+0.70z)| lr 6.00e-04 | 8464.64 ms | -100.0% bf16 MFU | 62129 tok/s +step 871/19560 | loss 4.495169 (-1.06z)| norm 0.6588 (+0.78z)| lr 6.00e-04 | 8465.18 ms | -100.0% bf16 MFU | 62119 tok/s +step 872/19560 | loss 4.441453 (-1.68z)| norm 0.5714 (-0.22z)| lr 6.00e-04 | 8460.01 ms | -100.0% bf16 MFU | 62112 tok/s +step 873/19560 | loss 4.496285 (-1.01z)| norm 0.5136 (-0.88z)| lr 6.00e-04 | 8458.73 ms | -100.0% bf16 MFU | 62106 tok/s +step 874/19560 | loss 4.427608 (-1.80z)| norm 0.5828 (-0.07z)| lr 6.00e-04 | 8461.27 ms | -100.0% bf16 MFU | 62098 tok/s +step 875/19560 | loss 4.523173 (-0.64z)| norm 0.5389 (-0.57z)| lr 6.00e-04 | 8455.97 ms | -100.0% bf16 MFU | 62094 tok/s +step 876/19560 | loss 4.445790 (-1.56z)| norm 0.5587 (-0.32z)| lr 6.00e-04 | 8455.86 ms | -100.0% bf16 MFU | 62089 tok/s +step 877/19560 | loss 4.468552 (-1.27z)| norm 0.5629 (-0.26z)| lr 6.00e-04 | 8466.83 ms | -100.0% bf16 MFU | 62081 tok/s +step 878/19560 | loss 4.424280 (-1.79z)| norm 0.5046 (-0.98z)| lr 6.00e-04 | 8464.52 ms | -100.0% bf16 MFU | 62074 tok/s +step 879/19560 | loss 4.456848 (-1.38z)| norm 0.4494 (-1.68z)| lr 6.00e-04 | 8459.41 ms | -100.0% bf16 MFU | 62069 tok/s +step 880/19560 | loss 4.422067 (-1.80z)| norm 0.4381 (-1.80z)| lr 6.00e-04 | 8457.32 ms | -100.0% bf16 MFU | 62065 tok/s +step 881/19560 | loss 4.421774 (-1.79z)| norm 0.4780 (-1.26z)| lr 6.00e-04 | 8456.04 ms | -100.0% bf16 MFU | 62062 tok/s +step 882/19560 | loss 4.372871 (-2.37z)| norm 0.5018 (-0.94z)| lr 6.00e-04 | 8458.37 ms | -100.0% bf16 MFU | 62058 tok/s +step 883/19560 | loss 4.445949 (-1.43z)| norm 0.5210 (-0.68z)| lr 6.00e-04 | 8461.00 ms | -100.0% bf16 MFU | 62053 tok/s +step 884/19560 | loss 4.421447 (-1.72z)| norm 0.5231 (-0.64z)| lr 6.00e-04 | 8455.95 ms | -100.0% bf16 MFU | 62051 tok/s +step 885/19560 | loss 4.442682 (-1.43z)| norm 0.5210 (-0.66z)| lr 6.00e-04 | 8454.01 ms | -100.0% bf16 MFU | 62049 tok/s +step 886/19560 | loss 4.461952 (-1.16z)| norm 0.5631 (-0.10z)| lr 6.00e-04 | 8454.62 ms | -100.0% bf16 MFU | 62047 tok/s +step 887/19560 | loss 4.472097 (-1.02z)| norm 0.5787 (+0.10z)| lr 6.00e-04 | 8457.44 ms | -100.0% bf16 MFU | 62044 tok/s +step 888/19560 | loss 4.420265 (-1.67z)| norm 0.5494 (-0.28z)| lr 6.00e-04 | 8453.83 ms | -100.0% bf16 MFU | 62043 tok/s +step 889/19560 | loss 4.449206 (-1.27z)| norm 0.4918 (-1.04z)| lr 6.00e-04 | 8456.85 ms | -100.0% bf16 MFU | 62041 tok/s +step 890/19560 | loss 4.558464 (+0.18z)| norm 0.4361 (-1.74z)| lr 6.00e-04 | 8454.19 ms | -100.0% bf16 MFU | 62039 tok/s +step 891/19560 | loss 4.414739 (-1.73z)| norm 0.4681 (-1.31z)| lr 6.00e-04 | 8461.34 ms | -100.0% bf16 MFU | 62036 tok/s +step 892/19560 | loss 4.367108 (-2.32z)| norm 0.5623 (-0.09z)| lr 6.00e-04 | 8455.85 ms | -100.0% bf16 MFU | 62034 tok/s +step 893/19560 | loss 4.462342 (-1.03z)| norm 0.5789 (+0.13z)| lr 6.00e-04 | 8461.37 ms | -100.0% bf16 MFU | 62030 tok/s +step 894/19560 | loss 4.411872 (-1.68z)| norm 0.4951 (-0.95z)| lr 6.00e-04 | 8458.67 ms | -100.0% bf16 MFU | 62028 tok/s +step 895/19560 | loss 4.417604 (-1.59z)| norm 0.4596 (-1.39z)| lr 6.00e-04 | 8458.26 ms | -100.0% bf16 MFU | 62026 tok/s +step 896/19560 | loss 4.462972 (-0.97z)| norm 0.5519 (-0.17z)| lr 6.00e-04 | 8457.45 ms | -100.0% bf16 MFU | 62024 tok/s +step 897/19560 | loss 4.387460 (-1.98z)| norm 0.6211 (+0.73z)| lr 6.00e-04 | 8452.25 ms | -100.0% bf16 MFU | 62024 tok/s +step 898/19560 | loss 4.398155 (-1.80z)| norm 0.6919 (+1.63z)| lr 6.00e-04 | 8464.49 ms | -100.0% bf16 MFU | 62020 tok/s +step 899/19560 | loss 4.495266 (-0.45z)| norm 0.7002 (+1.71z)| lr 6.00e-04 | 8456.57 ms | -100.0% bf16 MFU | 62019 tok/s +step 900/19560 | loss 4.546827 (+0.28z)| norm 0.7605 (+2.41z)| lr 6.00e-04 | 8463.80 ms | -100.0% bf16 MFU | 62015 tok/s +step 901/19560 | loss 4.395344 (-1.80z)| norm 0.6700 (+1.26z)| lr 6.00e-04 | 8452.32 ms | -100.0% bf16 MFU | 62016 tok/s +step 902/19560 | loss 4.401875 (-1.69z)| norm 0.4650 (-1.28z)| lr 6.00e-04 | 8449.70 ms | -100.0% bf16 MFU | 62018 tok/s +step 903/19560 | loss 4.405766 (-1.61z)| norm 0.4812 (-1.06z)| lr 6.00e-04 | 8461.01 ms | -100.0% bf16 MFU | 62015 tok/s +step 904/19560 | loss 4.421282 (-1.37z)| norm 0.4726 (-1.15z)| lr 6.00e-04 | 8453.42 ms | -100.0% bf16 MFU | 62015 tok/s +step 905/19560 | loss 4.379497 (-1.91z)| norm 0.4545 (-1.35z)| lr 6.00e-04 | 8460.71 ms | -100.0% bf16 MFU | 62013 tok/s +step 906/19560 | loss 4.343550 (-2.33z)| norm 0.5867 (+0.28z)| lr 6.00e-04 | 8456.49 ms | -100.0% bf16 MFU | 62012 tok/s +step 907/19560 | loss 4.397103 (-1.58z)| norm 0.4133 (-1.83z)| lr 6.00e-04 | 8451.51 ms | -100.0% bf16 MFU | 62013 tok/s +step 908/19560 | loss 4.441381 (-0.97z)| norm 0.3814 (-2.17z)| lr 6.00e-04 | 8454.81 ms | -100.0% bf16 MFU | 62013 tok/s +step 909/19560 | loss 4.417187 (-1.28z)| norm 0.4180 (-1.69z)| lr 6.00e-04 | 8452.70 ms | -100.0% bf16 MFU | 62014 tok/s +step 910/19560 | loss 4.393542 (-1.57z)| norm 0.5040 (-0.65z)| lr 6.00e-04 | 8453.74 ms | -100.0% bf16 MFU | 62014 tok/s +step 911/19560 | loss 4.448248 (-0.82z)| norm 0.6561 (+1.17z)| lr 6.00e-04 | 8453.71 ms | -100.0% bf16 MFU | 62014 tok/s +step 912/19560 | loss 4.386652 (-1.63z)| norm 0.6140 (+0.65z)| lr 6.00e-04 | 8447.77 ms | -100.0% bf16 MFU | 62017 tok/s +step 913/19560 | loss 4.457045 (-0.66z)| norm 0.6261 (+0.80z)| lr 6.00e-04 | 8448.10 ms | -100.0% bf16 MFU | 62019 tok/s +step 914/19560 | loss 4.457324 (-0.65z)| norm 0.6171 (+0.69z)| lr 6.00e-04 | 8446.76 ms | -100.0% bf16 MFU | 62021 tok/s +step 915/19560 | loss 4.419936 (-1.14z)| norm 0.5230 (-0.42z)| lr 6.00e-04 | 8453.43 ms | -100.0% bf16 MFU | 62021 tok/s +step 916/19560 | loss 4.431975 (-0.97z)| norm 0.5088 (-0.59z)| lr 6.00e-04 | 8456.69 ms | -100.0% bf16 MFU | 62020 tok/s +step 917/19560 | loss 4.423111 (-1.08z)| norm 0.5298 (-0.34z)| lr 6.00e-04 | 8450.71 ms | -100.0% bf16 MFU | 62021 tok/s +step 918/19560 | loss 4.451551 (-0.67z)| norm 0.4813 (-0.92z)| lr 6.00e-04 | 8450.06 ms | -100.0% bf16 MFU | 62022 tok/s +step 919/19560 | loss 4.365971 (-1.83z)| norm 0.4054 (-1.79z)| lr 6.00e-04 | 8448.06 ms | -100.0% bf16 MFU | 62024 tok/s +step 920/19560 | loss 4.423643 (-1.01z)| norm 0.4196 (-1.59z)| lr 6.00e-04 | 8449.49 ms | -100.0% bf16 MFU | 62026 tok/s +step 921/19560 | loss 4.372063 (-1.71z)| norm 0.4720 (-0.96z)| lr 6.00e-04 | 8449.19 ms | -100.0% bf16 MFU | 62027 tok/s +step 922/19560 | loss 4.368515 (-1.73z)| norm 0.4667 (-1.01z)| lr 6.00e-04 | 8440.91 ms | -100.0% bf16 MFU | 62031 tok/s +step 923/19560 | loss 4.366874 (-1.72z)| norm 0.4522 (-1.16z)| lr 6.00e-04 | 8446.86 ms | -100.0% bf16 MFU | 62033 tok/s +step 924/19560 | loss 4.350673 (-1.91z)| norm 0.4240 (-1.46z)| lr 6.00e-04 | 8449.98 ms | -100.0% bf16 MFU | 62034 tok/s +step 925/19560 | loss 4.372705 (-1.58z)| norm 0.4106 (-1.59z)| lr 6.00e-04 | 8447.56 ms | -100.0% bf16 MFU | 62035 tok/s +step 926/19560 | loss 4.358565 (-1.75z)| norm 0.3885 (-1.80z)| lr 6.00e-04 | 8448.34 ms | -100.0% bf16 MFU | 62036 tok/s +step 927/19560 | loss 4.370344 (-1.57z)| norm 0.4430 (-1.17z)| lr 6.00e-04 | 8446.22 ms | -100.0% bf16 MFU | 62038 tok/s +step 928/19560 | loss 4.387617 (-1.30z)| norm 0.5558 (+0.12z)| lr 6.00e-04 | 8449.81 ms | -100.0% bf16 MFU | 62039 tok/s +step 929/19560 | loss 4.374991 (-1.46z)| norm 0.6170 (+0.80z)| lr 6.00e-04 | 8453.33 ms | -100.0% bf16 MFU | 62038 tok/s +step 930/19560 | loss 4.363777 (-1.59z)| norm 0.6404 (+1.05z)| lr 6.00e-04 | 8453.92 ms | -100.0% bf16 MFU | 62037 tok/s +step 931/19560 | loss 4.396457 (-1.12z)| norm 0.6463 (+1.12z)| lr 6.00e-04 | 8452.74 ms | -100.0% bf16 MFU | 62036 tok/s +step 932/19560 | loss 4.370023 (-1.46z)| norm 0.5710 (+0.29z)| lr 6.00e-04 | 8441.52 ms | -100.0% bf16 MFU | 62040 tok/s +step 933/19560 | loss 4.415001 (-0.82z)| norm 0.5783 (+0.39z)| lr 6.00e-04 | 8444.64 ms | -100.0% bf16 MFU | 62042 tok/s +step 934/19560 | loss 4.333207 (-1.95z)| norm 0.4774 (-0.78z)| lr 6.00e-04 | 8445.99 ms | -100.0% bf16 MFU | 62044 tok/s +step 935/19560 | loss 4.368430 (-1.42z)| norm 0.4945 (-0.58z)| lr 6.00e-04 | 8445.41 ms | -100.0% bf16 MFU | 62046 tok/s +step 936/19560 | loss 4.368199 (-1.41z)| norm 0.4583 (-1.00z)| lr 6.00e-04 | 8456.92 ms | -100.0% bf16 MFU | 62043 tok/s +step 937/19560 | loss 4.374469 (-1.30z)| norm 0.4425 (-1.19z)| lr 6.00e-04 | 8452.00 ms | -100.0% bf16 MFU | 62042 tok/s +step 938/19560 | loss 4.377110 (-1.25z)| norm 0.4394 (-1.21z)| lr 6.00e-04 | 8451.12 ms | -100.0% bf16 MFU | 62042 tok/s +step 939/19560 | loss 4.361742 (-1.44z)| norm 0.4262 (-1.35z)| lr 6.00e-04 | 8443.89 ms | -100.0% bf16 MFU | 62045 tok/s +step 940/19560 | loss 4.381366 (-1.15z)| norm 0.3969 (-1.66z)| lr 6.00e-04 | 8440.74 ms | -100.0% bf16 MFU | 62048 tok/s +step 941/19560 | loss 4.357056 (-1.48z)| norm 0.4622 (-0.89z)| lr 6.00e-04 | 8446.20 ms | -100.0% bf16 MFU | 62049 tok/s +step 942/19560 | loss 4.351519 (-1.53z)| norm 0.5298 (-0.09z)| lr 6.00e-04 | 8443.21 ms | -100.0% bf16 MFU | 62052 tok/s +step 943/19560 | loss 4.316575 (-1.99z)| norm 0.5393 (+0.02z)| lr 6.00e-04 | 8444.23 ms | -100.0% bf16 MFU | 62054 tok/s +step 944/19560 | loss 4.358535 (-1.37z)| norm 0.5014 (-0.42z)| lr 6.00e-04 | 8436.06 ms | -100.0% bf16 MFU | 62058 tok/s +step 945/19560 | loss 4.344923 (-1.53z)| norm 0.4368 (-1.16z)| lr 6.00e-04 | 8436.52 ms | -100.0% bf16 MFU | 62063 tok/s +step 946/19560 | loss 4.365270 (-1.23z)| norm 0.5400 (+0.04z)| lr 6.00e-04 | 8444.04 ms | -100.0% bf16 MFU | 62064 tok/s +step 947/19560 | loss 4.353825 (-1.37z)| norm 0.5093 (-0.30z)| lr 6.00e-04 | 8438.45 ms | -100.0% bf16 MFU | 62067 tok/s +step 948/19560 | loss 4.399934 (-0.71z)| norm 0.4873 (-0.55z)| lr 6.00e-04 | 8438.16 ms | -100.0% bf16 MFU | 62071 tok/s +step 949/19560 | loss 4.330841 (-1.68z)| norm 0.4468 (-1.04z)| lr 6.00e-04 | 8439.36 ms | -100.0% bf16 MFU | 62073 tok/s +step 950/19560 | loss 4.403479 (-0.62z)| norm 0.4345 (-1.18z)| lr 6.00e-04 | 8437.62 ms | -100.0% bf16 MFU | 62076 tok/s +step 951/19560 | loss 4.233434 (-2.98z)| norm 0.4404 (-1.09z)| lr 6.00e-04 | 8438.05 ms | -100.0% bf16 MFU | 62079 tok/s +step 952/19560 | loss 4.373678 (-0.98z)| norm 0.4673 (-0.74z)| lr 6.00e-04 | 8440.05 ms | -100.0% bf16 MFU | 62081 tok/s +step 953/19560 | loss 4.309599 (-1.91z)| norm 0.4713 (-0.69z)| lr 6.00e-04 | 8440.15 ms | -100.0% bf16 MFU | 62083 tok/s +step 954/19560 | loss 4.353989 (-1.24z)| norm 0.5056 (-0.27z)| lr 6.00e-04 | 8439.55 ms | -100.0% bf16 MFU | 62085 tok/s +step 955/19560 | loss 4.336106 (-1.48z)| norm 0.4527 (-0.94z)| lr 6.00e-04 | 8435.98 ms | -100.0% bf16 MFU | 62088 tok/s +step 956/19560 | loss 4.300848 (-1.96z)| norm 0.4377 (-1.12z)| lr 6.00e-04 | 8434.15 ms | -100.0% bf16 MFU | 62092 tok/s +step 957/19560 | loss 4.354042 (-1.16z)| norm 0.4193 (-1.33z)| lr 6.00e-04 | 8434.21 ms | -100.0% bf16 MFU | 62096 tok/s +step 958/19560 | loss 4.356682 (-1.11z)| norm 0.4401 (-1.05z)| lr 6.00e-04 | 8436.62 ms | -100.0% bf16 MFU | 62098 tok/s +step 959/19560 | loss 4.354937 (-1.12z)| norm 0.4313 (-1.15z)| lr 6.00e-04 | 8438.66 ms | -100.0% bf16 MFU | 62100 tok/s +step 960/19560 | loss 4.334123 (-1.40z)| norm 0.4990 (-0.30z)| lr 6.00e-04 | 8436.39 ms | -100.0% bf16 MFU | 62102 tok/s +step 961/19560 | loss 4.349060 (-1.17z)| norm 0.5377 (+0.19z)| lr 6.00e-04 | 8439.07 ms | -100.0% bf16 MFU | 62103 tok/s +step 962/19560 | loss 4.414637 (-0.18z)| norm 0.5322 (+0.13z)| lr 6.00e-04 | 8435.59 ms | -100.0% bf16 MFU | 62106 tok/s +step 963/19560 | loss 4.367518 (-0.88z)| norm 0.5613 (+0.51z)| lr 6.00e-04 | 8434.66 ms | -100.0% bf16 MFU | 62108 tok/s +step 964/19560 | loss 4.403090 (-0.33z)| norm 0.5556 (+0.44z)| lr 6.00e-04 | 8438.16 ms | -100.0% bf16 MFU | 62109 tok/s +step 965/19560 | loss 4.320701 (-1.60z)| norm 0.5015 (-0.25z)| lr 6.00e-04 | 8435.73 ms | -100.0% bf16 MFU | 62111 tok/s +step 966/19560 | loss 4.304974 (-1.81z)| norm 0.4533 (-0.85z)| lr 6.00e-04 | 8436.32 ms | -100.0% bf16 MFU | 62113 tok/s +step 967/19560 | loss 4.325768 (-1.46z)| norm 0.4495 (-0.89z)| lr 6.00e-04 | 8441.53 ms | -100.0% bf16 MFU | 62113 tok/s +step 968/19560 | loss 4.354164 (-1.00z)| norm 0.4575 (-0.78z)| lr 6.00e-04 | 8435.98 ms | -100.0% bf16 MFU | 62115 tok/s +step 969/19560 | loss 4.344861 (-1.14z)| norm 0.5204 (+0.01z)| lr 6.00e-04 | 8439.75 ms | -100.0% bf16 MFU | 62115 tok/s +step 970/19560 | loss 4.306200 (-1.73z)| norm 0.5125 (-0.09z)| lr 6.00e-04 | 8438.49 ms | -100.0% bf16 MFU | 62116 tok/s +step 971/19560 | loss 4.325552 (-1.40z)| norm 0.4472 (-0.91z)| lr 6.00e-04 | 8436.45 ms | -100.0% bf16 MFU | 62117 tok/s +step 972/19560 | loss 4.275351 (-2.15z)| norm 0.4774 (-0.54z)| lr 6.00e-04 | 8436.70 ms | -100.0% bf16 MFU | 62119 tok/s +step 973/19560 | loss 4.398692 (-0.19z)| norm 0.4586 (-0.79z)| lr 6.00e-04 | 8435.38 ms | -100.0% bf16 MFU | 62120 tok/s +step 974/19560 | loss 4.310369 (-1.57z)| norm 0.4928 (-0.35z)| lr 6.00e-04 | 8442.07 ms | -100.0% bf16 MFU | 62120 tok/s +step 975/19560 | loss 4.355044 (-0.85z)| norm 0.5213 (+0.02z)| lr 6.00e-04 | 8447.47 ms | -100.0% bf16 MFU | 62117 tok/s +step 976/19560 | loss 4.358995 (-0.77z)| norm 0.5576 (+0.49z)| lr 6.00e-04 | 8441.11 ms | -100.0% bf16 MFU | 62117 tok/s +step 977/19560 | loss 4.343891 (-1.00z)| norm 0.5127 (-0.08z)| lr 6.00e-04 | 8442.33 ms | -100.0% bf16 MFU | 62116 tok/s +step 978/19560 | loss 4.348778 (-0.91z)| norm 0.4618 (-0.72z)| lr 6.00e-04 | 8441.43 ms | -100.0% bf16 MFU | 62116 tok/s +step 979/19560 | loss 4.293694 (-1.76z)| norm 0.4780 (-0.50z)| lr 6.00e-04 | 8445.66 ms | -100.0% bf16 MFU | 62114 tok/s +step 980/19560 | loss 4.352978 (-0.80z)| norm 0.4134 (-1.32z)| lr 6.00e-04 | 8438.97 ms | -100.0% bf16 MFU | 62114 tok/s +step 981/19560 | loss 4.369946 (-0.51z)| norm 0.4396 (-0.97z)| lr 6.00e-04 | 8438.92 ms | -100.0% bf16 MFU | 62115 tok/s +step 982/19560 | loss 4.277159 (-2.01z)| norm 0.4632 (-0.65z)| lr 6.00e-04 | 8442.07 ms | -100.0% bf16 MFU | 62114 tok/s +step 983/19560 | loss 4.340019 (-0.96z)| norm 0.4645 (-0.62z)| lr 6.00e-04 | 8439.79 ms | -100.0% bf16 MFU | 62115 tok/s +step 984/19560 | loss 4.285377 (-1.84z)| norm 0.4132 (-1.30z)| lr 6.00e-04 | 8442.62 ms | -100.0% bf16 MFU | 62114 tok/s +step 985/19560 | loss 4.260518 (-2.19z)| norm 0.3951 (-1.52z)| lr 6.00e-04 | 8439.66 ms | -100.0% bf16 MFU | 62114 tok/s +step 986/19560 | loss 4.276128 (-1.90z)| norm 0.3522 (-2.04z)| lr 6.00e-04 | 8438.56 ms | -100.0% bf16 MFU | 62115 tok/s +step 987/19560 | loss 4.386197 (-0.10z)| norm 0.3968 (-1.43z)| lr 6.00e-04 | 8443.54 ms | -100.0% bf16 MFU | 62114 tok/s +step 988/19560 | loss 4.357625 (-0.56z)| norm 0.3781 (-1.66z)| lr 6.00e-04 | 8443.23 ms | -100.0% bf16 MFU | 62113 tok/s +step 989/19560 | loss 4.281478 (-1.78z)| norm 0.4378 (-0.88z)| lr 6.00e-04 | 8448.18 ms | -100.0% bf16 MFU | 62110 tok/s +step 990/19560 | loss 4.279620 (-1.77z)| norm 0.4870 (-0.24z)| lr 6.00e-04 | 8440.14 ms | -100.0% bf16 MFU | 62111 tok/s +step 991/19560 | loss 4.324593 (-1.03z)| norm 0.5237 (+0.24z)| lr 6.00e-04 | 8441.35 ms | -100.0% bf16 MFU | 62111 tok/s +step 992/19560 | loss 4.335235 (-0.84z)| norm 0.5750 (+0.90z)| lr 6.00e-04 | 8448.31 ms | -100.0% bf16 MFU | 62108 tok/s +step 993/19560 | loss 4.354484 (-0.52z)| norm 0.6670 (+2.06z)| lr 6.00e-04 | 8439.71 ms | -100.0% bf16 MFU | 62109 tok/s +step 994/19560 | loss 4.372334 (-0.21z)| norm 0.6807 (+2.20z)| lr 6.00e-04 | 8444.91 ms | -100.0% bf16 MFU | 62108 tok/s +step 995/19560 | loss 4.317942 (-1.10z)| norm 0.5940 (+1.09z)| lr 6.00e-04 | 8443.12 ms | -100.0% bf16 MFU | 62107 tok/s +step 996/19560 | loss 4.339880 (-0.72z)| norm 0.5613 (+0.68z)| lr 6.00e-04 | 8439.26 ms | -100.0% bf16 MFU | 62108 tok/s +step 997/19560 | loss 4.402906 (+0.37z)| norm 0.8403 (+3.97z)| lr 6.00e-04 | 8441.52 ms | -100.0% bf16 MFU | 62108 tok/s +step 998/19560 | loss 4.256374 (-2.11z)| norm 0.5691 (+0.74z)| lr 6.00e-04 | 8439.61 ms | -100.0% bf16 MFU | 62109 tok/s +step 999/19560 | loss 4.408206 (+0.49z)| norm 0.6451 (+1.66z)| lr 6.00e-04 | 8441.94 ms | -100.0% bf16 MFU | 62108 tok/s +step 1000/19560 | loss 4.368209 (-0.19z)| norm 0.7405 (+2.73z)| lr 6.00e-04 | 8442.09 ms | -100.0% bf16 MFU | 62108 tok/s +val loss 4.358351 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2571/10042 = 0.256025 +step 1001/19560 | loss 4.326060 (-0.91z)| norm 0.6513 (+1.64z)| lr 6.00e-04 | 8442.24 ms | -100.0% bf16 MFU | 62108 tok/s +step 1002/19560 | loss 4.332892 (-0.78z)| norm 0.5453 (+0.41z)| lr 6.00e-04 | 8441.96 ms | -100.0% bf16 MFU | 62108 tok/s +step 1003/19560 | loss 4.360882 (-0.27z)| norm 0.4473 (-0.73z)| lr 6.00e-04 | 8442.43 ms | -100.0% bf16 MFU | 62108 tok/s +step 1004/19560 | loss 4.291972 (-1.49z)| norm 0.4004 (-1.25z)| lr 6.00e-04 | 8446.82 ms | -100.0% bf16 MFU | 62106 tok/s +step 1005/19560 | loss 4.390268 (+0.30z)| norm 0.5873 (+0.91z)| lr 6.00e-04 | 8442.15 ms | -100.0% bf16 MFU | 62106 tok/s +step 1006/19560 | loss 4.344454 (-0.53z)| norm 0.4217 (-1.00z)| lr 6.00e-04 | 8442.13 ms | -100.0% bf16 MFU | 62105 tok/s +step 1007/19560 | loss 4.358182 (-0.26z)| norm 0.4810 (-0.31z)| lr 6.00e-04 | 8444.65 ms | -100.0% bf16 MFU | 62104 tok/s +step 1008/19560 | loss 4.298266 (-1.34z)| norm 0.4823 (-0.30z)| lr 6.00e-04 | 8441.58 ms | -100.0% bf16 MFU | 62105 tok/s +step 1009/19560 | loss 4.256430 (-2.06z)| norm 0.6605 (+1.73z)| lr 6.00e-04 | 8439.19 ms | -100.0% bf16 MFU | 62106 tok/s +step 1010/19560 | loss 4.357083 (-0.24z)| norm 0.4716 (-0.44z)| lr 6.00e-04 | 8440.60 ms | -100.0% bf16 MFU | 62106 tok/s +step 1011/19560 | loss 4.366784 (-0.05z)| norm 0.8687 (+3.84z)| lr 6.00e-04 | 8443.90 ms | -100.0% bf16 MFU | 62105 tok/s +step 1012/19560 | loss 4.328157 (-0.74z)| norm 0.4848 (-0.29z)| lr 6.00e-04 | 8436.67 ms | -100.0% bf16 MFU | 62107 tok/s +step 1013/19560 | loss 4.333050 (-0.64z)| norm 0.4840 (-0.30z)| lr 6.00e-04 | 8438.06 ms | -100.0% bf16 MFU | 62109 tok/s +step 1014/19560 | loss 4.314169 (-0.97z)| norm 0.6953 (+1.94z)| lr 6.00e-04 | 8441.72 ms | -100.0% bf16 MFU | 62108 tok/s +step 1015/19560 | loss 4.279074 (-1.60z)| norm 0.6103 (+1.03z)| lr 6.00e-04 | 8443.43 ms | -100.0% bf16 MFU | 62108 tok/s +step 1016/19560 | loss 4.310845 (-1.00z)| norm 0.6162 (+1.08z)| lr 6.00e-04 | 8445.01 ms | -100.0% bf16 MFU | 62106 tok/s +step 1017/19560 | loss 4.317875 (-0.85z)| norm 2.3011 (+9.66z)| lr 6.00e-04 | 8441.88 ms | -100.0% bf16 MFU | 62106 tok/s +step 1018/19560 | loss 4.424510 (+1.22z)| norm 0.7649 (+1.27z)| lr 6.00e-04 | 8444.66 ms | -100.0% bf16 MFU | 62105 tok/s +step 1019/19560 | loss 4.341167 (-0.41z)| norm 0.6919 (+0.86z)| lr 6.00e-04 | 8446.71 ms | -100.0% bf16 MFU | 62104 tok/s +step 1020/19560 | loss 4.355760 (-0.12z)| norm 0.6414 (+0.59z)| lr 6.00e-04 | 8445.48 ms | -100.0% bf16 MFU | 62102 tok/s +step 1021/19560 | loss 4.293059 (-1.34z)| norm 0.5310 (-0.01z)| lr 6.00e-04 | 8443.09 ms | -100.0% bf16 MFU | 62102 tok/s +step 1022/19560 | loss 4.356758 (-0.06z)| norm 0.5137 (-0.10z)| lr 6.00e-04 | 8443.29 ms | -100.0% bf16 MFU | 62102 tok/s +step 1023/19560 | loss 4.305220 (-1.08z)| norm 0.5796 (+0.25z)| lr 6.00e-04 | 8441.15 ms | -100.0% bf16 MFU | 62102 tok/s +step 1024/19560 | loss 4.330781 (-0.55z)| norm 0.6660 (+0.71z)| lr 6.00e-04 | 8434.03 ms | -100.0% bf16 MFU | 62105 tok/s +step 1025/19560 | loss 4.271700 (-1.72z)| norm 0.5223 (-0.06z)| lr 6.00e-04 | 8435.81 ms | -100.0% bf16 MFU | 62108 tok/s +step 1026/19560 | loss 4.289821 (-1.33z)| norm 0.5018 (-0.16z)| lr 6.00e-04 | 8434.50 ms | -100.0% bf16 MFU | 62110 tok/s +step 1027/19560 | loss 4.313235 (-0.86z)| norm 0.3889 (-0.76z)| lr 6.00e-04 | 8432.09 ms | -100.0% bf16 MFU | 62114 tok/s +step 1028/19560 | loss 4.328909 (-0.53z)| norm 0.4236 (-0.56z)| lr 6.00e-04 | 8433.12 ms | -100.0% bf16 MFU | 62116 tok/s +step 1029/19560 | loss 4.266496 (-1.86z)| norm 0.4242 (-0.55z)| lr 6.00e-04 | 8431.74 ms | -100.0% bf16 MFU | 62120 tok/s +step 1030/19560 | loss 4.296401 (-1.19z)| norm 0.4730 (-0.28z)| lr 6.00e-04 | 8431.26 ms | -100.0% bf16 MFU | 62123 tok/s +step 1031/19560 | loss 4.318621 (-0.70z)| norm 0.5229 (-0.01z)| lr 6.00e-04 | 8435.19 ms | -100.0% bf16 MFU | 62124 tok/s +step 1032/19560 | loss 4.285645 (-1.40z)| norm 0.5172 (-0.05z)| lr 6.00e-04 | 8433.91 ms | -100.0% bf16 MFU | 62126 tok/s +step 1033/19560 | loss 4.299171 (-1.08z)| norm 0.4604 (-0.36z)| lr 6.00e-04 | 8430.53 ms | -100.0% bf16 MFU | 62130 tok/s +step 1034/19560 | loss 4.351541 (+0.05z)| norm 0.4494 (-0.41z)| lr 6.00e-04 | 8433.09 ms | -100.0% bf16 MFU | 62132 tok/s +step 1035/19560 | loss 4.356528 (+0.17z)| norm 0.4506 (-0.41z)| lr 6.00e-04 | 8430.99 ms | -100.0% bf16 MFU | 62134 tok/s +step 1036/19560 | loss 4.334928 (-0.29z)| norm 0.4157 (-0.60z)| lr 6.00e-04 | 8434.65 ms | -100.0% bf16 MFU | 62135 tok/s +step 1037/19560 | loss 4.268365 (-1.74z)| norm 0.4201 (-0.57z)| lr 6.00e-04 | 8430.02 ms | -100.0% bf16 MFU | 62138 tok/s +step 1038/19560 | loss 4.306734 (-0.87z)| norm 0.4634 (-0.34z)| lr 6.00e-04 | 8433.66 ms | -100.0% bf16 MFU | 62140 tok/s +step 1039/19560 | loss 4.356033 (+0.24z)| norm 0.4242 (-0.54z)| lr 6.00e-04 | 8433.48 ms | -100.0% bf16 MFU | 62141 tok/s +step 1040/19560 | loss 4.332072 (-0.29z)| norm 0.4689 (-0.29z)| lr 6.00e-04 | 8434.67 ms | -100.0% bf16 MFU | 62142 tok/s +step 1041/19560 | loss 4.292250 (-1.19z)| norm 0.4397 (-0.44z)| lr 6.00e-04 | 8433.36 ms | -100.0% bf16 MFU | 62143 tok/s +step 1042/19560 | loss 4.237720 (-2.42z)| norm 0.4089 (-0.60z)| lr 6.00e-04 | 8434.12 ms | -100.0% bf16 MFU | 62144 tok/s +step 1043/19560 | loss 4.306788 (-0.81z)| norm 0.3657 (-0.83z)| lr 6.00e-04 | 8437.14 ms | -100.0% bf16 MFU | 62144 tok/s +step 1044/19560 | loss 4.206156 (-3.07z)| norm 0.3637 (-0.83z)| lr 6.00e-04 | 8434.96 ms | -100.0% bf16 MFU | 62145 tok/s +step 1045/19560 | loss 4.278649 (-1.38z)| norm 0.3903 (-0.68z)| lr 6.00e-04 | 8437.34 ms | -100.0% bf16 MFU | 62144 tok/s +step 1046/19560 | loss 4.281994 (-1.30z)| norm 0.4084 (-0.58z)| lr 6.00e-04 | 8439.67 ms | -100.0% bf16 MFU | 62143 tok/s +step 1047/19560 | loss 4.301482 (-0.83z)| norm 0.4554 (-0.33z)| lr 6.00e-04 | 8437.83 ms | -100.0% bf16 MFU | 62143 tok/s +step 1048/19560 | loss 4.263335 (-1.71z)| norm 0.4254 (-0.49z)| lr 5.99e-04 | 8437.04 ms | -100.0% bf16 MFU | 62143 tok/s +step 1049/19560 | loss 4.291885 (-1.01z)| norm 0.3733 (-0.76z)| lr 5.99e-04 | 8439.15 ms | -100.0% bf16 MFU | 62142 tok/s +step 1050/19560 | loss 4.283015 (-1.20z)| norm 0.3525 (-0.87z)| lr 5.99e-04 | 8440.41 ms | -100.0% bf16 MFU | 62141 tok/s +step 1051/19560 | loss 4.216736 (-2.68z)| norm 0.3882 (-0.67z)| lr 5.99e-04 | 8439.50 ms | -100.0% bf16 MFU | 62140 tok/s +step 1052/19560 | loss 4.231372 (-2.27z)| norm 0.4253 (-0.47z)| lr 5.99e-04 | 8439.40 ms | -100.0% bf16 MFU | 62139 tok/s +step 1053/19560 | loss 4.284748 (-1.05z)| norm 0.3962 (-0.63z)| lr 5.99e-04 | 8435.47 ms | -100.0% bf16 MFU | 62140 tok/s +step 1054/19560 | loss 4.170074 (-3.44z)| norm 0.5424 (+0.15z)| lr 5.99e-04 | 8435.48 ms | -100.0% bf16 MFU | 62140 tok/s +step 1055/19560 | loss 4.310961 (-0.39z)| norm 0.5057 (-0.05z)| lr 5.99e-04 | 8433.67 ms | -100.0% bf16 MFU | 62142 tok/s +step 1056/19560 | loss 4.207678 (-2.54z)| norm 0.4842 (-0.16z)| lr 5.99e-04 | 8431.66 ms | -100.0% bf16 MFU | 62144 tok/s +step 1057/19560 | loss 4.231538 (-1.99z)| norm 0.4911 (-0.12z)| lr 5.99e-04 | 8432.71 ms | -100.0% bf16 MFU | 62145 tok/s +step 1058/19560 | loss 4.378260 (+1.08z)| norm 0.5095 (-0.01z)| lr 5.99e-04 | 8430.97 ms | -100.0% bf16 MFU | 62147 tok/s +step 1059/19560 | loss 4.275465 (-1.05z)| norm 0.5134 (+0.01z)| lr 5.99e-04 | 8434.36 ms | -100.0% bf16 MFU | 62148 tok/s +step 1060/19560 | loss 4.254093 (-1.48z)| norm 0.4536 (-0.31z)| lr 5.99e-04 | 8456.02 ms | -100.0% bf16 MFU | 62141 tok/s +step 1061/19560 | loss 4.233463 (-1.88z)| norm 0.3880 (-0.66z)| lr 5.99e-04 | 8454.87 ms | -100.0% bf16 MFU | 62134 tok/s +step 1062/19560 | loss 4.246282 (-1.58z)| norm 0.3896 (-0.64z)| lr 5.99e-04 | 8457.71 ms | -100.0% bf16 MFU | 62127 tok/s +step 1063/19560 | loss 4.212427 (-2.22z)| norm 0.3968 (-0.60z)| lr 5.99e-04 | 8461.68 ms | -100.0% bf16 MFU | 62118 tok/s +step 1064/19560 | loss 4.249949 (-1.43z)| norm 0.4382 (-0.37z)| lr 5.99e-04 | 8456.03 ms | -100.0% bf16 MFU | 62113 tok/s +step 1065/19560 | loss 4.273447 (-0.94z)| norm 0.4728 (-0.19z)| lr 5.99e-04 | 8456.04 ms | -100.0% bf16 MFU | 62107 tok/s +step 1066/19560 | loss 4.256979 (-1.25z)| norm 0.4319 (-0.41z)| lr 5.99e-04 | 8458.30 ms | -100.0% bf16 MFU | 62101 tok/s +step 1067/19560 | loss 4.305985 (-0.25z)| norm 0.4720 (-0.19z)| lr 5.99e-04 | 8460.42 ms | -100.0% bf16 MFU | 62094 tok/s +step 1068/19560 | loss 4.256327 (-1.24z)| norm 0.4210 (-0.47z)| lr 5.99e-04 | 8457.83 ms | -100.0% bf16 MFU | 62089 tok/s +step 1069/19560 | loss 4.379456 (+1.25z)| norm 0.3919 (-0.62z)| lr 5.99e-04 | 8452.30 ms | -100.0% bf16 MFU | 62086 tok/s +step 1070/19560 | loss 4.261258 (-1.12z)| norm 0.5520 (+0.24z)| lr 5.99e-04 | 8457.36 ms | -100.0% bf16 MFU | 62081 tok/s +step 1071/19560 | loss 4.309181 (-0.15z)| norm 0.4898 (-0.09z)| lr 5.99e-04 | 8457.65 ms | -100.0% bf16 MFU | 62077 tok/s +step 1072/19560 | loss 4.233713 (-1.64z)| norm 0.4374 (-0.37z)| lr 5.99e-04 | 8456.61 ms | -100.0% bf16 MFU | 62073 tok/s +step 1073/19560 | loss 4.366256 (+1.00z)| norm 0.3987 (-0.58z)| lr 5.99e-04 | 8459.45 ms | -100.0% bf16 MFU | 62068 tok/s +step 1074/19560 | loss 4.333093 (+0.35z)| norm 0.3900 (-0.62z)| lr 5.99e-04 | 8455.73 ms | -100.0% bf16 MFU | 62065 tok/s +step 1075/19560 | loss 4.232360 (-1.64z)| norm 0.4241 (-0.43z)| lr 5.99e-04 | 8453.63 ms | -100.0% bf16 MFU | 62063 tok/s +step 1076/19560 | loss 4.328486 (+0.29z)| norm 0.4400 (-0.35z)| lr 5.99e-04 | 8458.04 ms | -100.0% bf16 MFU | 62059 tok/s +step 1077/19560 | loss 4.278880 (-0.70z)| norm 0.4525 (-0.28z)| lr 5.99e-04 | 8462.85 ms | -100.0% bf16 MFU | 62053 tok/s +step 1078/19560 | loss 4.274809 (-0.77z)| norm 0.3534 (-0.81z)| lr 5.99e-04 | 8456.95 ms | -100.0% bf16 MFU | 62050 tok/s +step 1079/19560 | loss 4.235724 (-1.57z)| norm 0.3590 (-0.77z)| lr 5.99e-04 | 8455.89 ms | -100.0% bf16 MFU | 62048 tok/s +step 1080/19560 | loss 4.310092 (-0.05z)| norm 0.4071 (-0.51z)| lr 5.99e-04 | 8461.58 ms | -100.0% bf16 MFU | 62044 tok/s +step 1081/19560 | loss 4.293530 (-0.38z)| norm 0.4978 (-0.03z)| lr 5.99e-04 | 8456.38 ms | -100.0% bf16 MFU | 62041 tok/s +step 1082/19560 | loss 4.297855 (-0.29z)| norm 0.5519 (+0.26z)| lr 5.99e-04 | 8453.90 ms | -100.0% bf16 MFU | 62040 tok/s +step 1083/19560 | loss 4.336494 (+0.51z)| norm 0.5895 (+0.46z)| lr 5.99e-04 | 8452.10 ms | -100.0% bf16 MFU | 62040 tok/s +step 1084/19560 | loss 4.285151 (-0.54z)| norm 0.4530 (-0.27z)| lr 5.99e-04 | 8461.06 ms | -100.0% bf16 MFU | 62036 tok/s +step 1085/19560 | loss 4.289014 (-0.45z)| norm 0.4213 (-0.44z)| lr 5.99e-04 | 8459.00 ms | -100.0% bf16 MFU | 62033 tok/s +step 1086/19560 | loss 4.260278 (-1.03z)| norm 0.4038 (-0.54z)| lr 5.99e-04 | 8458.18 ms | -100.0% bf16 MFU | 62031 tok/s +step 1087/19560 | loss 4.266508 (-0.88z)| norm 0.4440 (-0.32z)| lr 5.99e-04 | 8455.14 ms | -100.0% bf16 MFU | 62030 tok/s +step 1088/19560 | loss 4.220455 (-1.79z)| norm 0.4647 (-0.21z)| lr 5.99e-04 | 8461.55 ms | -100.0% bf16 MFU | 62026 tok/s +step 1089/19560 | loss 4.284904 (-0.47z)| norm 0.4160 (-0.46z)| lr 5.99e-04 | 8455.50 ms | -100.0% bf16 MFU | 62025 tok/s +step 1090/19560 | loss 4.262360 (-0.92z)| norm 0.4907 (-0.06z)| lr 5.99e-04 | 8453.48 ms | -100.0% bf16 MFU | 62025 tok/s +step 1091/19560 | loss 4.307780 (+0.02z)| norm 0.4772 (-0.13z)| lr 5.99e-04 | 8455.50 ms | -100.0% bf16 MFU | 62024 tok/s +step 1092/19560 | loss 4.229090 (-1.59z)| norm 0.3719 (-0.69z)| lr 5.99e-04 | 8465.85 ms | -100.0% bf16 MFU | 62019 tok/s +step 1093/19560 | loss 4.322689 (+0.36z)| norm 0.3778 (-0.65z)| lr 5.99e-04 | 8455.73 ms | -100.0% bf16 MFU | 62019 tok/s +step 1094/19560 | loss 4.244804 (-1.24z)| norm 0.3700 (-0.69z)| lr 5.99e-04 | 8462.17 ms | -100.0% bf16 MFU | 62015 tok/s +step 1095/19560 | loss 4.278123 (-0.55z)| norm 0.4284 (-0.37z)| lr 5.99e-04 | 8451.25 ms | -100.0% bf16 MFU | 62017 tok/s +step 1096/19560 | loss 4.272294 (-0.65z)| norm 0.4710 (-0.15z)| lr 5.99e-04 | 8460.01 ms | -100.0% bf16 MFU | 62014 tok/s +step 1097/19560 | loss 4.257386 (-0.95z)| norm 0.4690 (-0.16z)| lr 5.99e-04 | 8457.13 ms | -100.0% bf16 MFU | 62013 tok/s +step 1098/19560 | loss 4.171304 (-2.64z)| norm 0.4961 (-0.01z)| lr 5.99e-04 | 8457.00 ms | -100.0% bf16 MFU | 62012 tok/s +step 1099/19560 | loss 4.251033 (-1.01z)| norm 0.5159 (+0.09z)| lr 5.99e-04 | 8455.32 ms | -100.0% bf16 MFU | 62012 tok/s +step 1100/19560 | loss 4.279157 (-0.45z)| norm 0.4334 (-0.35z)| lr 5.99e-04 | 8460.61 ms | -100.0% bf16 MFU | 62010 tok/s +step 1101/19560 | loss 4.209352 (-1.83z)| norm 0.4536 (-0.24z)| lr 5.99e-04 | 8460.14 ms | -100.0% bf16 MFU | 62008 tok/s +step 1102/19560 | loss 4.296344 (-0.07z)| norm 0.5034 (+0.03z)| lr 5.99e-04 | 8459.14 ms | -100.0% bf16 MFU | 62006 tok/s +step 1103/19560 | loss 4.238568 (-1.22z)| norm 0.4495 (-0.26z)| lr 5.99e-04 | 8462.72 ms | -100.0% bf16 MFU | 62004 tok/s +step 1104/19560 | loss 4.253112 (-0.91z)| norm 0.3963 (-0.53z)| lr 5.99e-04 | 8459.63 ms | -100.0% bf16 MFU | 62002 tok/s +step 1105/19560 | loss 4.251983 (-0.92z)| norm 0.3751 (-0.64z)| lr 5.99e-04 | 8459.82 ms | -100.0% bf16 MFU | 62001 tok/s +step 1106/19560 | loss 4.231562 (-1.31z)| norm 0.3850 (-0.58z)| lr 5.99e-04 | 8452.42 ms | -100.0% bf16 MFU | 62002 tok/s +step 1107/19560 | loss 4.234233 (-1.24z)| norm 0.4189 (-0.40z)| lr 5.99e-04 | 8455.21 ms | -100.0% bf16 MFU | 62003 tok/s +step 1108/19560 | loss 4.181278 (-2.24z)| norm 0.4535 (-0.22z)| lr 5.99e-04 | 8454.03 ms | -100.0% bf16 MFU | 62003 tok/s +step 1109/19560 | loss 4.354621 (+1.18z)| norm 0.4624 (-0.17z)| lr 5.99e-04 | 8450.41 ms | -100.0% bf16 MFU | 62005 tok/s +step 1110/19560 | loss 4.173231 (-2.33z)| norm 0.4157 (-0.42z)| lr 5.99e-04 | 8443.93 ms | -100.0% bf16 MFU | 62010 tok/s +step 1111/19560 | loss 4.293780 (+0.01z)| norm 0.4133 (-0.43z)| lr 5.99e-04 | 8438.82 ms | -100.0% bf16 MFU | 62015 tok/s +step 1112/19560 | loss 4.237015 (-1.08z)| norm 0.3819 (-0.59z)| lr 5.99e-04 | 8434.16 ms | -100.0% bf16 MFU | 62023 tok/s +step 1113/19560 | loss 4.264703 (-0.55z)| norm 0.3551 (-0.73z)| lr 5.99e-04 | 8443.05 ms | -100.0% bf16 MFU | 62026 tok/s +step 1114/19560 | loss 4.253038 (-0.77z)| norm 0.3578 (-0.72z)| lr 5.99e-04 | 8443.96 ms | -100.0% bf16 MFU | 62030 tok/s +step 1115/19560 | loss 4.179696 (-2.14z)| norm 0.3800 (-0.60z)| lr 5.99e-04 | 8440.98 ms | -100.0% bf16 MFU | 62034 tok/s +step 1116/19560 | loss 4.215721 (-1.43z)| norm 0.3332 (-0.85z)| lr 5.99e-04 | 8443.89 ms | -100.0% bf16 MFU | 62037 tok/s +step 1117/19560 | loss 4.206085 (-1.58z)| norm 0.3664 (-0.67z)| lr 5.99e-04 | 8440.05 ms | -100.0% bf16 MFU | 62041 tok/s +step 1118/19560 | loss 4.244551 (-0.85z)| norm 0.4187 (-0.39z)| lr 5.99e-04 | 8438.66 ms | -100.0% bf16 MFU | 62045 tok/s +step 1119/19560 | loss 4.252079 (-0.69z)| norm 0.4546 (-0.19z)| lr 5.99e-04 | 8438.71 ms | -100.0% bf16 MFU | 62049 tok/s +step 1120/19560 | loss 4.256962 (-0.59z)| norm 0.4108 (-0.42z)| lr 5.99e-04 | 8442.11 ms | -100.0% bf16 MFU | 62052 tok/s +step 1121/19560 | loss 4.201201 (-1.62z)| norm 0.3614 (-0.67z)| lr 5.99e-04 | 8437.81 ms | -100.0% bf16 MFU | 62056 tok/s +step 1122/19560 | loss 4.257427 (-0.55z)| norm 0.3807 (-0.56z)| lr 5.99e-04 | 8439.75 ms | -100.0% bf16 MFU | 62060 tok/s +step 1123/19560 | loss 4.240874 (-0.85z)| norm 0.3892 (-0.50z)| lr 5.99e-04 | 8439.21 ms | -100.0% bf16 MFU | 62063 tok/s +step 1124/19560 | loss 4.223949 (-1.15z)| norm 0.3873 (-0.50z)| lr 5.99e-04 | 8440.37 ms | -100.0% bf16 MFU | 62066 tok/s +step 1125/19560 | loss 4.170702 (-2.13z)| norm 0.3752 (-0.56z)| lr 5.99e-04 | 8441.68 ms | -100.0% bf16 MFU | 62068 tok/s +step 1126/19560 | loss 4.204658 (-1.47z)| norm 0.3319 (-0.78z)| lr 5.99e-04 | 8440.98 ms | -100.0% bf16 MFU | 62070 tok/s +step 1127/19560 | loss 4.202639 (-1.50z)| norm 0.3475 (-0.68z)| lr 5.99e-04 | 8440.28 ms | -100.0% bf16 MFU | 62072 tok/s +step 1128/19560 | loss 4.266174 (-0.27z)| norm 0.4265 (-0.25z)| lr 5.99e-04 | 8442.02 ms | -100.0% bf16 MFU | 62074 tok/s +step 1129/19560 | loss 4.199097 (-1.54z)| norm 0.4685 (-0.01z)| lr 5.99e-04 | 8442.18 ms | -100.0% bf16 MFU | 62075 tok/s +step 1130/19560 | loss 4.304296 (+0.49z)| norm 0.4570 (-0.07z)| lr 5.99e-04 | 8439.87 ms | -100.0% bf16 MFU | 62078 tok/s +step 1131/19560 | loss 4.281062 (+0.06z)| norm 0.4836 (+0.07z)| lr 5.99e-04 | 8437.92 ms | -100.0% bf16 MFU | 62080 tok/s +step 1132/19560 | loss 4.202580 (-1.45z)| norm 0.4746 (+0.02z)| lr 5.99e-04 | 8438.44 ms | -100.0% bf16 MFU | 62083 tok/s +step 1133/19560 | loss 4.259470 (-0.33z)| norm 0.4491 (-0.11z)| lr 5.99e-04 | 8441.83 ms | -100.0% bf16 MFU | 62084 tok/s +step 1134/19560 | loss 4.242587 (-0.65z)| norm 0.4158 (-0.29z)| lr 5.99e-04 | 8442.17 ms | -100.0% bf16 MFU | 62085 tok/s +step 1135/19560 | loss 4.198705 (-1.50z)| norm 0.3692 (-0.54z)| lr 5.99e-04 | 8443.08 ms | -100.0% bf16 MFU | 62086 tok/s +step 1136/19560 | loss 4.233478 (-0.80z)| norm 0.4134 (-0.30z)| lr 5.99e-04 | 8444.88 ms | -100.0% bf16 MFU | 62086 tok/s +step 1137/19560 | loss 4.330499 (+1.10z)| norm 0.4369 (-0.16z)| lr 5.99e-04 | 8444.75 ms | -100.0% bf16 MFU | 62085 tok/s +step 1138/19560 | loss 4.175890 (-1.91z)| norm 0.4226 (-0.24z)| lr 5.99e-04 | 8444.72 ms | -100.0% bf16 MFU | 62085 tok/s +step 1139/19560 | loss 4.179602 (-1.81z)| norm 0.3438 (-0.66z)| lr 5.99e-04 | 8445.19 ms | -100.0% bf16 MFU | 62085 tok/s +step 1140/19560 | loss 4.214099 (-1.11z)| norm 0.3651 (-0.53z)| lr 5.99e-04 | 8443.35 ms | -100.0% bf16 MFU | 62086 tok/s +step 1141/19560 | loss 4.267506 (-0.05z)| norm 0.3896 (-0.39z)| lr 5.99e-04 | 8446.30 ms | -100.0% bf16 MFU | 62085 tok/s +step 1142/19560 | loss 4.181620 (-1.71z)| norm 0.4163 (-0.24z)| lr 5.99e-04 | 8447.95 ms | -100.0% bf16 MFU | 62084 tok/s +step 1143/19560 | loss 4.219888 (-0.95z)| norm 0.4670 (+0.05z)| lr 5.99e-04 | 8446.20 ms | -100.0% bf16 MFU | 62083 tok/s +step 1144/19560 | loss 4.290790 (+0.44z)| norm 0.5373 (+0.45z)| lr 5.99e-04 | 8450.67 ms | -100.0% bf16 MFU | 62081 tok/s +step 1145/19560 | loss 4.244938 (-0.45z)| norm 0.5208 (+1.10z)| lr 5.99e-04 | 8448.94 ms | -100.0% bf16 MFU | 62080 tok/s +step 1146/19560 | loss 4.204394 (-1.25z)| norm 0.3814 (-0.90z)| lr 5.99e-04 | 8444.51 ms | -100.0% bf16 MFU | 62080 tok/s +step 1147/19560 | loss 4.334097 (+1.38z)| norm 0.4421 (+0.07z)| lr 5.99e-04 | 8448.07 ms | -100.0% bf16 MFU | 62079 tok/s +step 1148/19560 | loss 4.209173 (-1.14z)| norm 0.4059 (-0.51z)| lr 5.99e-04 | 8441.15 ms | -100.0% bf16 MFU | 62081 tok/s +step 1149/19560 | loss 4.226658 (-0.77z)| norm 0.3678 (-1.15z)| lr 5.99e-04 | 8442.55 ms | -100.0% bf16 MFU | 62082 tok/s +step 1150/19560 | loss 4.181923 (-1.67z)| norm 0.3783 (-0.96z)| lr 5.99e-04 | 8437.78 ms | -100.0% bf16 MFU | 62084 tok/s +step 1151/19560 | loss 4.310469 (+0.97z)| norm 0.4374 (+0.09z)| lr 5.99e-04 | 8439.68 ms | -100.0% bf16 MFU | 62086 tok/s +step 1152/19560 | loss 4.179587 (-1.68z)| norm 0.4436 (+0.24z)| lr 5.99e-04 | 8437.05 ms | -100.0% bf16 MFU | 62089 tok/s +step 1153/19560 | loss 4.187228 (-1.50z)| norm 0.4442 (+0.27z)| lr 5.99e-04 | 8437.16 ms | -100.0% bf16 MFU | 62092 tok/s +step 1154/19560 | loss 4.213127 (-0.96z)| norm 0.4559 (+0.51z)| lr 5.99e-04 | 8437.57 ms | -100.0% bf16 MFU | 62094 tok/s +step 1155/19560 | loss 4.169183 (-1.81z)| norm 0.4592 (+0.56z)| lr 5.99e-04 | 8441.22 ms | -100.0% bf16 MFU | 62095 tok/s +step 1156/19560 | loss 4.212283 (-0.93z)| norm 0.4151 (-0.29z)| lr 5.99e-04 | 8442.59 ms | -100.0% bf16 MFU | 62095 tok/s +step 1157/19560 | loss 4.228110 (-0.61z)| norm 0.4160 (-0.28z)| lr 5.99e-04 | 8435.95 ms | -100.0% bf16 MFU | 62098 tok/s +step 1158/19560 | loss 4.216619 (-0.83z)| norm 0.4614 (+0.61z)| lr 5.99e-04 | 8436.08 ms | -100.0% bf16 MFU | 62100 tok/s +step 1159/19560 | loss 4.228724 (-0.57z)| norm 0.4380 (+0.17z)| lr 5.99e-04 | 8439.35 ms | -100.0% bf16 MFU | 62101 tok/s +step 1160/19560 | loss 4.132954 (-2.42z)| norm 0.3706 (-1.14z)| lr 5.99e-04 | 8436.44 ms | -100.0% bf16 MFU | 62104 tok/s +step 1161/19560 | loss 4.183104 (-1.41z)| norm 0.3840 (-0.87z)| lr 5.99e-04 | 8436.00 ms | -100.0% bf16 MFU | 62106 tok/s +step 1162/19560 | loss 4.151833 (-1.99z)| norm 0.4025 (-0.49z)| lr 5.99e-04 | 8439.24 ms | -100.0% bf16 MFU | 62107 tok/s +step 1163/19560 | loss 4.200769 (-1.02z)| norm 0.3604 (-1.30z)| lr 5.99e-04 | 8437.90 ms | -100.0% bf16 MFU | 62108 tok/s +step 1164/19560 | loss 4.153092 (-1.93z)| norm 0.3415 (-1.65z)| lr 5.99e-04 | 8438.27 ms | -100.0% bf16 MFU | 62109 tok/s +step 1165/19560 | loss 4.200481 (-0.98z)| norm 0.3297 (-1.84z)| lr 5.99e-04 | 8438.45 ms | -100.0% bf16 MFU | 62111 tok/s +step 1166/19560 | loss 4.184743 (-1.27z)| norm 0.3436 (-1.54z)| lr 5.99e-04 | 8440.82 ms | -100.0% bf16 MFU | 62111 tok/s +step 1167/19560 | loss 4.190882 (-1.14z)| norm 0.3675 (-1.07z)| lr 5.99e-04 | 8442.64 ms | -100.0% bf16 MFU | 62110 tok/s +step 1168/19560 | loss 4.167179 (-1.58z)| norm 0.3614 (-1.17z)| lr 5.99e-04 | 8438.51 ms | -100.0% bf16 MFU | 62111 tok/s +step 1169/19560 | loss 4.231122 (-0.30z)| norm 0.3645 (-1.09z)| lr 5.99e-04 | 8440.88 ms | -100.0% bf16 MFU | 62111 tok/s +step 1170/19560 | loss 4.237477 (-0.18z)| norm 0.3638 (-1.10z)| lr 5.99e-04 | 8443.13 ms | -100.0% bf16 MFU | 62111 tok/s +step 1171/19560 | loss 4.160308 (-1.68z)| norm 0.4079 (-0.28z)| lr 5.99e-04 | 8441.65 ms | -100.0% bf16 MFU | 62110 tok/s +step 1172/19560 | loss 4.142429 (-2.00z)| norm 0.3796 (-0.81z)| lr 5.99e-04 | 8440.96 ms | -100.0% bf16 MFU | 62110 tok/s +step 1173/19560 | loss 4.153449 (-1.75z)| norm 0.4481 (+0.47z)| lr 5.99e-04 | 8439.88 ms | -100.0% bf16 MFU | 62111 tok/s +step 1174/19560 | loss 4.205191 (-0.73z)| norm 0.4425 (+0.36z)| lr 5.99e-04 | 8440.49 ms | -100.0% bf16 MFU | 62111 tok/s +step 1175/19560 | loss 4.217984 (-0.47z)| norm 0.4457 (+0.42z)| lr 5.99e-04 | 8440.09 ms | -100.0% bf16 MFU | 62112 tok/s +step 1176/19560 | loss 4.214265 (-0.54z)| norm 0.4081 (-0.28z)| lr 5.99e-04 | 8440.58 ms | -100.0% bf16 MFU | 62112 tok/s +step 1177/19560 | loss 4.179042 (-1.21z)| norm 0.3632 (-1.13z)| lr 5.99e-04 | 8442.03 ms | -100.0% bf16 MFU | 62111 tok/s +step 1178/19560 | loss 4.221930 (-0.36z)| norm 0.3459 (-1.45z)| lr 5.99e-04 | 8444.93 ms | -100.0% bf16 MFU | 62110 tok/s +step 1179/19560 | loss 4.158119 (-1.58z)| norm 0.3537 (-1.29z)| lr 5.99e-04 | 8438.97 ms | -100.0% bf16 MFU | 62111 tok/s +step 1180/19560 | loss 4.173415 (-1.27z)| norm 0.3617 (-1.13z)| lr 5.99e-04 | 8437.47 ms | -100.0% bf16 MFU | 62112 tok/s +step 1181/19560 | loss 4.243453 (+0.08z)| norm 0.3394 (-1.52z)| lr 5.99e-04 | 8434.61 ms | -100.0% bf16 MFU | 62115 tok/s +step 1182/19560 | loss 4.210926 (-0.56z)| norm 0.3811 (-0.74z)| lr 5.99e-04 | 8436.38 ms | -100.0% bf16 MFU | 62116 tok/s +step 1183/19560 | loss 4.211284 (-0.54z)| norm 0.3969 (-0.43z)| lr 5.99e-04 | 8438.75 ms | -100.0% bf16 MFU | 62117 tok/s +step 1184/19560 | loss 4.135247 (-1.98z)| norm 0.3843 (-0.66z)| lr 5.99e-04 | 8434.24 ms | -100.0% bf16 MFU | 62119 tok/s +step 1185/19560 | loss 4.184384 (-1.02z)| norm 0.4092 (-0.18z)| lr 5.99e-04 | 8437.58 ms | -100.0% bf16 MFU | 62120 tok/s +step 1186/19560 | loss 4.286325 (+0.96z)| norm 0.4784 (+1.17z)| lr 5.99e-04 | 8434.27 ms | -100.0% bf16 MFU | 62122 tok/s +step 1187/19560 | loss 4.178799 (-1.13z)| norm 0.4795 (+1.20z)| lr 5.99e-04 | 8435.29 ms | -100.0% bf16 MFU | 62124 tok/s +step 1188/19560 | loss 4.230902 (-0.11z)| norm 0.4678 (+0.97z)| lr 5.99e-04 | 8435.60 ms | -100.0% bf16 MFU | 62125 tok/s +step 1189/19560 | loss 4.198155 (-0.74z)| norm 0.4180 (-0.00z)| lr 5.99e-04 | 8439.95 ms | -100.0% bf16 MFU | 62125 tok/s +step 1190/19560 | loss 4.160152 (-1.46z)| norm 0.3964 (-0.43z)| lr 5.99e-04 | 8439.59 ms | -100.0% bf16 MFU | 62125 tok/s +step 1191/19560 | loss 4.110752 (-2.35z)| norm 0.3973 (-0.41z)| lr 5.99e-04 | 8433.44 ms | -100.0% bf16 MFU | 62127 tok/s +step 1192/19560 | loss 4.241627 (+0.13z)| norm 0.3708 (-0.91z)| lr 5.99e-04 | 8436.28 ms | -100.0% bf16 MFU | 62128 tok/s +step 1193/19560 | loss 4.185397 (-0.92z)| norm 0.4204 (+0.06z)| lr 5.99e-04 | 8432.74 ms | -100.0% bf16 MFU | 62130 tok/s +step 1194/19560 | loss 4.144421 (-1.66z)| norm 0.3928 (-0.47z)| lr 5.99e-04 | 8440.97 ms | -100.0% bf16 MFU | 62129 tok/s +step 1195/19560 | loss 4.164222 (-1.27z)| norm 0.4008 (-0.31z)| lr 5.99e-04 | 8441.42 ms | -100.0% bf16 MFU | 62128 tok/s +step 1196/19560 | loss 4.154187 (-1.43z)| norm 0.4907 (+1.44z)| lr 5.99e-04 | 8436.34 ms | -100.0% bf16 MFU | 62129 tok/s +step 1197/19560 | loss 4.228555 (-0.03z)| norm 0.4966 (+1.52z)| lr 5.99e-04 | 8438.09 ms | -100.0% bf16 MFU | 62129 tok/s +step 1198/19560 | loss 4.160052 (-1.32z)| norm 0.5236 (+2.07z)| lr 5.99e-04 | 8442.20 ms | -100.0% bf16 MFU | 62128 tok/s +step 1199/19560 | loss 4.208241 (-0.39z)| norm 0.4764 (+1.15z)| lr 5.99e-04 | 8445.13 ms | -100.0% bf16 MFU | 62126 tok/s +step 1200/19560 | loss 4.221226 (-0.13z)| norm 0.4286 (+0.22z)| lr 5.99e-04 | 8439.80 ms | -100.0% bf16 MFU | 62125 tok/s +step 1201/19560 | loss 4.322546 (+1.85z)| norm 0.3947 (-0.44z)| lr 5.99e-04 | 8437.00 ms | -100.0% bf16 MFU | 62126 tok/s +step 1202/19560 | loss 4.159914 (-1.32z)| norm 0.3462 (-1.38z)| lr 5.99e-04 | 8439.04 ms | -100.0% bf16 MFU | 62126 tok/s +step 1203/19560 | loss 4.226366 (-0.00z)| norm 0.3129 (-1.97z)| lr 5.99e-04 | 8438.17 ms | -100.0% bf16 MFU | 62127 tok/s +step 1204/19560 | loss 4.201705 (-0.48z)| norm 0.2972 (-2.21z)| lr 5.99e-04 | 8440.93 ms | -100.0% bf16 MFU | 62126 tok/s +step 1205/19560 | loss 4.152503 (-1.44z)| norm 0.3210 (-1.73z)| lr 5.99e-04 | 8444.43 ms | -100.0% bf16 MFU | 62124 tok/s +step 1206/19560 | loss 4.138775 (-1.68z)| norm 0.3502 (-1.18z)| lr 5.99e-04 | 8437.95 ms | -100.0% bf16 MFU | 62124 tok/s +step 1207/19560 | loss 4.209097 (-0.28z)| norm 0.3816 (-0.61z)| lr 5.99e-04 | 8437.10 ms | -100.0% bf16 MFU | 62125 tok/s +step 1208/19560 | loss 4.125535 (-1.90z)| norm 0.3145 (-1.82z)| lr 5.99e-04 | 8444.53 ms | -100.0% bf16 MFU | 62123 tok/s +step 1209/19560 | loss 4.166795 (-1.07z)| norm 0.3303 (-1.51z)| lr 5.99e-04 | 8442.53 ms | -100.0% bf16 MFU | 62122 tok/s +step 1210/19560 | loss 4.129112 (-1.79z)| norm 0.3597 (-0.96z)| lr 5.99e-04 | 8441.79 ms | -100.0% bf16 MFU | 62121 tok/s +step 1211/19560 | loss 4.190046 (-0.57z)| norm 0.3706 (-0.75z)| lr 5.99e-04 | 8441.60 ms | -100.0% bf16 MFU | 62121 tok/s +step 1212/19560 | loss 4.172193 (-0.92z)| norm 0.4173 (+0.17z)| lr 5.99e-04 | 8445.39 ms | -100.0% bf16 MFU | 62119 tok/s +step 1213/19560 | loss 4.096864 (-2.38z)| norm 0.4052 (-0.06z)| lr 5.99e-04 | 8447.93 ms | -100.0% bf16 MFU | 62116 tok/s +step 1214/19560 | loss 4.209296 (-0.13z)| norm 0.3781 (-0.59z)| lr 5.99e-04 | 8448.12 ms | -100.0% bf16 MFU | 62113 tok/s +step 1215/19560 | loss 4.166880 (-0.96z)| norm 0.4128 (+0.09z)| lr 5.99e-04 | 8446.25 ms | -100.0% bf16 MFU | 62111 tok/s +step 1216/19560 | loss 4.254341 (+0.79z)| norm 0.4837 (+1.48z)| lr 5.99e-04 | 8445.87 ms | -100.0% bf16 MFU | 62109 tok/s +step 1217/19560 | loss 4.153593 (-1.21z)| norm 0.4844 (+1.48z)| lr 5.99e-04 | 8443.96 ms | -100.0% bf16 MFU | 62108 tok/s +step 1218/19560 | loss 4.194986 (-0.37z)| norm 0.4617 (+1.04z)| lr 5.99e-04 | 8446.38 ms | -100.0% bf16 MFU | 62107 tok/s +step 1219/19560 | loss 4.172286 (-0.82z)| norm 0.3971 (-0.21z)| lr 5.99e-04 | 8446.13 ms | -100.0% bf16 MFU | 62105 tok/s +step 1220/19560 | loss 4.088356 (-2.44z)| norm 0.3714 (-0.72z)| lr 5.99e-04 | 8446.93 ms | -100.0% bf16 MFU | 62103 tok/s +step 1221/19560 | loss 4.117585 (-1.84z)| norm 0.4095 (+0.03z)| lr 5.99e-04 | 8448.02 ms | -100.0% bf16 MFU | 62101 tok/s +step 1222/19560 | loss 4.312923 (+2.02z)| norm 0.4560 (+0.93z)| lr 5.99e-04 | 8445.97 ms | -100.0% bf16 MFU | 62100 tok/s +step 1223/19560 | loss 4.167620 (-0.83z)| norm 0.4423 (+0.66z)| lr 5.99e-04 | 8444.93 ms | -100.0% bf16 MFU | 62099 tok/s +step 1224/19560 | loss 4.235076 (+0.52z)| norm 0.3628 (-0.90z)| lr 5.99e-04 | 8449.52 ms | -100.0% bf16 MFU | 62096 tok/s +step 1225/19560 | loss 4.167438 (-0.82z)| norm 0.3537 (-1.06z)| lr 5.99e-04 | 8444.87 ms | -100.0% bf16 MFU | 62096 tok/s +step 1226/19560 | loss 4.139028 (-1.37z)| norm 0.3257 (-1.59z)| lr 5.99e-04 | 8446.42 ms | -100.0% bf16 MFU | 62095 tok/s +step 1227/19560 | loss 4.140365 (-1.32z)| norm 0.3174 (-1.74z)| lr 5.99e-04 | 8445.42 ms | -100.0% bf16 MFU | 62094 tok/s +step 1228/19560 | loss 4.176530 (-0.59z)| norm 0.3464 (-1.14z)| lr 5.99e-04 | 8450.64 ms | -100.0% bf16 MFU | 62091 tok/s +step 1229/19560 | loss 4.127951 (-1.53z)| norm 0.3578 (-0.90z)| lr 5.99e-04 | 8445.65 ms | -100.0% bf16 MFU | 62091 tok/s +step 1230/19560 | loss 4.196792 (-0.16z)| norm 0.4081 (+0.12z)| lr 5.99e-04 | 8447.72 ms | -100.0% bf16 MFU | 62089 tok/s +step 1231/19560 | loss 4.065388 (-2.68z)| norm 0.4006 (-0.02z)| lr 5.99e-04 | 8443.57 ms | -100.0% bf16 MFU | 62089 tok/s +step 1232/19560 | loss 4.101730 (-1.93z)| norm 0.4086 (+0.14z)| lr 5.99e-04 | 8445.54 ms | -100.0% bf16 MFU | 62089 tok/s +step 1233/19560 | loss 4.138077 (-1.22z)| norm 0.4426 (+0.82z)| lr 5.99e-04 | 8448.36 ms | -100.0% bf16 MFU | 62087 tok/s +step 1234/19560 | loss 4.245066 (+0.83z)| norm 0.4292 (+0.54z)| lr 5.99e-04 | 8447.89 ms | -100.0% bf16 MFU | 62086 tok/s +step 1235/19560 | loss 4.124845 (-1.44z)| norm 0.4514 (+0.98z)| lr 5.99e-04 | 8445.26 ms | -100.0% bf16 MFU | 62086 tok/s +step 1236/19560 | loss 4.140662 (-1.13z)| norm 0.4036 (+0.02z)| lr 5.99e-04 | 8445.03 ms | -100.0% bf16 MFU | 62086 tok/s +step 1237/19560 | loss 4.333231 (+2.53z)| norm 0.3900 (-0.25z)| lr 5.99e-04 | 8446.45 ms | -100.0% bf16 MFU | 62085 tok/s +step 1238/19560 | loss 4.123080 (-1.45z)| norm 0.3967 (-0.11z)| lr 5.99e-04 | 8446.65 ms | -100.0% bf16 MFU | 62084 tok/s +step 1239/19560 | loss 4.177068 (-0.42z)| norm 0.4081 (+0.13z)| lr 5.99e-04 | 8446.11 ms | -100.0% bf16 MFU | 62084 tok/s +step 1240/19560 | loss 4.147025 (-0.98z)| norm 0.3793 (-0.46z)| lr 5.99e-04 | 8445.44 ms | -100.0% bf16 MFU | 62083 tok/s +step 1241/19560 | loss 4.145471 (-1.00z)| norm 0.3624 (-0.81z)| lr 5.99e-04 | 8446.27 ms | -100.0% bf16 MFU | 62083 tok/s +step 1242/19560 | loss 4.119356 (-1.47z)| norm 0.4003 (-0.04z)| lr 5.99e-04 | 8448.45 ms | -100.0% bf16 MFU | 62082 tok/s +step 1243/19560 | loss 4.108959 (-1.64z)| norm 0.4135 (+0.23z)| lr 5.99e-04 | 8450.60 ms | -100.0% bf16 MFU | 62080 tok/s +step 1244/19560 | loss 4.173478 (-0.42z)| norm 0.3729 (-0.62z)| lr 5.99e-04 | 8447.08 ms | -100.0% bf16 MFU | 62079 tok/s +step 1245/19560 | loss 4.139975 (-1.03z)| norm 0.3728 (-0.62z)| lr 5.99e-04 | 8446.69 ms | -100.0% bf16 MFU | 62079 tok/s +step 1246/19560 | loss 4.149094 (-0.85z)| norm 0.4148 (+0.25z)| lr 5.99e-04 | 8445.88 ms | -100.0% bf16 MFU | 62078 tok/s +step 1247/19560 | loss 4.142470 (-0.96z)| norm 0.4238 (+0.44z)| lr 5.99e-04 | 8449.67 ms | -100.0% bf16 MFU | 62077 tok/s +step 1248/19560 | loss 4.108355 (-1.57z)| norm 0.5046 (+2.07z)| lr 5.99e-04 | 8446.38 ms | -100.0% bf16 MFU | 62077 tok/s +step 1249/19560 | loss 4.213403 (+0.39z)| norm 0.4271 (+0.48z)| lr 5.99e-04 | 8446.73 ms | -100.0% bf16 MFU | 62076 tok/s +step 1250/19560 | loss 4.133550 (-1.08z)| norm 0.3784 (-0.52z)| lr 5.99e-04 | 8458.44 ms | -100.0% bf16 MFU | 62072 tok/s +val loss 4.131935 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2606/10042 = 0.259510 +step 1251/19560 | loss 4.136459 (-1.01z)| norm 0.3691 (-0.70z)| lr 5.99e-04 | 8471.47 ms | -100.0% bf16 MFU | 62063 tok/s +step 1252/19560 | loss 4.096259 (-1.73z)| norm 0.3414 (-1.26z)| lr 5.99e-04 | 8471.08 ms | -100.0% bf16 MFU | 62054 tok/s +step 1253/19560 | loss 4.088764 (-1.83z)| norm 0.3317 (-1.44z)| lr 5.99e-04 | 8475.75 ms | -100.0% bf16 MFU | 62044 tok/s +step 1254/19560 | loss 4.175788 (-0.24z)| norm 0.3057 (-1.94z)| lr 5.99e-04 | 8473.69 ms | -100.0% bf16 MFU | 62036 tok/s +step 1255/19560 | loss 4.089392 (-1.78z)| norm 0.3206 (-1.63z)| lr 5.99e-04 | 8468.98 ms | -100.0% bf16 MFU | 62029 tok/s +step 1256/19560 | loss 4.067148 (-2.13z)| norm 0.3110 (-1.78z)| lr 5.99e-04 | 8474.53 ms | -100.0% bf16 MFU | 62021 tok/s +step 1257/19560 | loss 4.075560 (-1.94z)| norm 0.3223 (-1.53z)| lr 5.99e-04 | 8470.69 ms | -100.0% bf16 MFU | 62015 tok/s +step 1258/19560 | loss 4.066988 (-2.06z)| norm 0.3423 (-1.12z)| lr 5.99e-04 | 8471.26 ms | -100.0% bf16 MFU | 62009 tok/s +step 1259/19560 | loss 4.028923 (-2.65z)| norm 0.3373 (-1.21z)| lr 5.99e-04 | 8471.39 ms | -100.0% bf16 MFU | 62003 tok/s +step 1260/19560 | loss 4.087535 (-1.60z)| norm 0.3345 (-1.24z)| lr 5.99e-04 | 8473.09 ms | -100.0% bf16 MFU | 61996 tok/s +step 1261/19560 | loss 4.058834 (-2.05z)| norm 0.3538 (-0.85z)| lr 5.99e-04 | 8464.25 ms | -100.0% bf16 MFU | 61994 tok/s +step 1262/19560 | loss 4.162748 (-0.27z)| norm 0.3363 (-1.18z)| lr 5.99e-04 | 8467.45 ms | -100.0% bf16 MFU | 61990 tok/s +step 1263/19560 | loss 4.086973 (-1.54z)| norm 0.3106 (-1.66z)| lr 5.99e-04 | 8465.54 ms | -100.0% bf16 MFU | 61987 tok/s +step 1264/19560 | loss 4.086239 (-1.52z)| norm 0.3338 (-1.19z)| lr 5.99e-04 | 8466.02 ms | -100.0% bf16 MFU | 61984 tok/s +step 1265/19560 | loss 4.104451 (-1.21z)| norm 0.3705 (-0.46z)| lr 5.99e-04 | 8463.85 ms | -100.0% bf16 MFU | 61982 tok/s +step 1266/19560 | loss 4.062596 (-1.89z)| norm 0.3796 (-0.28z)| lr 5.99e-04 | 8460.71 ms | -100.0% bf16 MFU | 61981 tok/s +step 1267/19560 | loss 4.092904 (-1.35z)| norm 0.3709 (-0.46z)| lr 5.99e-04 | 8467.56 ms | -100.0% bf16 MFU | 61978 tok/s +step 1268/19560 | loss 4.115069 (-0.96z)| norm 0.3624 (-0.62z)| lr 5.99e-04 | 8458.00 ms | -100.0% bf16 MFU | 61978 tok/s +step 1269/19560 | loss 4.057159 (-1.90z)| norm 0.4192 (+0.48z)| lr 5.99e-04 | 8456.52 ms | -100.0% bf16 MFU | 61979 tok/s +step 1270/19560 | loss 4.105279 (-1.08z)| norm 0.4161 (+0.42z)| lr 5.99e-04 | 8447.48 ms | -100.0% bf16 MFU | 61984 tok/s +step 1271/19560 | loss 4.085865 (-1.38z)| norm 0.3787 (-0.30z)| lr 5.99e-04 | 8452.87 ms | -100.0% bf16 MFU | 61986 tok/s +step 1272/19560 | loss 4.050306 (-1.94z)| norm 0.5014 (+2.15z)| lr 5.99e-04 | 8451.10 ms | -100.0% bf16 MFU | 61988 tok/s +step 1273/19560 | loss 4.170454 (+0.07z)| norm 0.5730 (+3.48z)| lr 5.99e-04 | 8453.09 ms | -100.0% bf16 MFU | 61990 tok/s +step 1274/19560 | loss 4.156510 (-0.16z)| norm 0.5712 (+3.27z)| lr 5.99e-04 | 8446.77 ms | -100.0% bf16 MFU | 61994 tok/s +step 1275/19560 | loss 4.075418 (-1.52z)| norm 0.4304 (+0.66z)| lr 5.99e-04 | 8450.34 ms | -100.0% bf16 MFU | 61997 tok/s +step 1276/19560 | loss 4.172091 (+0.14z)| norm 0.3986 (+0.06z)| lr 5.99e-04 | 8450.56 ms | -100.0% bf16 MFU | 61999 tok/s +step 1277/19560 | loss 4.081131 (-1.40z)| norm 0.3760 (-0.36z)| lr 5.99e-04 | 8449.80 ms | -100.0% bf16 MFU | 62001 tok/s +step 1278/19560 | loss 4.122245 (-0.68z)| norm 0.4116 (+0.30z)| lr 5.99e-04 | 8445.51 ms | -100.0% bf16 MFU | 62005 tok/s +step 1279/19560 | loss 4.113247 (-0.83z)| norm 0.4336 (+0.71z)| lr 5.99e-04 | 8444.21 ms | -100.0% bf16 MFU | 62009 tok/s +step 1280/19560 | loss 4.050819 (-1.88z)| norm 0.4371 (+0.78z)| lr 5.99e-04 | 8453.88 ms | -100.0% bf16 MFU | 62010 tok/s +step 1281/19560 | loss 4.123672 (-0.61z)| norm 0.4066 (+0.21z)| lr 5.99e-04 | 8459.43 ms | -100.0% bf16 MFU | 62008 tok/s +step 1282/19560 | loss 4.181349 (+0.39z)| norm 0.6647 (+4.60z)| lr 5.99e-04 | 8446.76 ms | -100.0% bf16 MFU | 62011 tok/s +step 1283/19560 | loss 4.132582 (-0.45z)| norm 0.3639 (-0.55z)| lr 5.99e-04 | 8452.70 ms | -100.0% bf16 MFU | 62012 tok/s +step 1284/19560 | loss 4.067577 (-1.55z)| norm 0.3211 (-1.27z)| lr 5.99e-04 | 8452.79 ms | -100.0% bf16 MFU | 62013 tok/s +step 1285/19560 | loss 4.087420 (-1.19z)| norm 0.3184 (-1.30z)| lr 5.99e-04 | 8451.21 ms | -100.0% bf16 MFU | 62014 tok/s +step 1286/19560 | loss 4.082494 (-1.25z)| norm 0.3404 (-0.91z)| lr 5.99e-04 | 8454.92 ms | -100.0% bf16 MFU | 62014 tok/s +step 1287/19560 | loss 4.045028 (-1.86z)| norm 0.3490 (-0.75z)| lr 5.99e-04 | 8446.79 ms | -100.0% bf16 MFU | 62016 tok/s +step 1288/19560 | loss 4.055168 (-1.66z)| norm 0.3564 (-0.62z)| lr 5.99e-04 | 8455.06 ms | -100.0% bf16 MFU | 62016 tok/s +step 1289/19560 | loss 4.082685 (-1.17z)| norm 0.3989 (+0.10z)| lr 5.99e-04 | 8447.61 ms | -100.0% bf16 MFU | 62018 tok/s +step 1290/19560 | loss 4.134742 (-0.30z)| norm 0.4408 (+0.81z)| lr 5.99e-04 | 8456.79 ms | -100.0% bf16 MFU | 62017 tok/s +step 1291/19560 | loss 4.094344 (-0.96z)| norm 0.4774 (+1.41z)| lr 5.99e-04 | 8452.27 ms | -100.0% bf16 MFU | 62018 tok/s +step 1292/19560 | loss 4.159558 (+0.13z)| norm 0.4241 (+0.50z)| lr 5.99e-04 | 8448.82 ms | -100.0% bf16 MFU | 62020 tok/s +step 1293/19560 | loss 4.052171 (-1.64z)| norm 0.4580 (+1.06z)| lr 5.99e-04 | 8456.20 ms | -100.0% bf16 MFU | 62019 tok/s +step 1294/19560 | loss 4.131250 (-0.31z)| norm 0.4956 (+1.66z)| lr 5.99e-04 | 8452.54 ms | -100.0% bf16 MFU | 62019 tok/s +step 1295/19560 | loss 4.124016 (-0.43z)| norm 0.5364 (+2.28z)| lr 5.99e-04 | 8453.24 ms | -100.0% bf16 MFU | 62019 tok/s +step 1296/19560 | loss 4.124540 (-0.41z)| norm 0.4572 (+0.96z)| lr 5.99e-04 | 8452.49 ms | -100.0% bf16 MFU | 62020 tok/s +step 1297/19560 | loss 4.048822 (-1.64z)| norm 0.4222 (+0.37z)| lr 5.99e-04 | 8452.95 ms | -100.0% bf16 MFU | 62020 tok/s +step 1298/19560 | loss 4.140565 (-0.11z)| norm 0.3857 (-0.23z)| lr 5.99e-04 | 8451.83 ms | -100.0% bf16 MFU | 62021 tok/s +step 1299/19560 | loss 4.112001 (-0.58z)| norm 0.4203 (+0.34z)| lr 5.99e-04 | 8462.73 ms | -100.0% bf16 MFU | 62017 tok/s +step 1300/19560 | loss 4.119074 (-0.46z)| norm 0.3653 (-0.56z)| lr 5.99e-04 | 8460.80 ms | -100.0% bf16 MFU | 62015 tok/s +step 1301/19560 | loss 4.081920 (-1.06z)| norm 0.3361 (-1.03z)| lr 5.99e-04 | 8456.24 ms | -100.0% bf16 MFU | 62014 tok/s +step 1302/19560 | loss 4.090199 (-0.91z)| norm 0.3278 (-1.15z)| lr 5.98e-04 | 8456.47 ms | -100.0% bf16 MFU | 62013 tok/s +step 1303/19560 | loss 4.071144 (-1.21z)| norm 0.3142 (-1.35z)| lr 5.98e-04 | 8450.39 ms | -100.0% bf16 MFU | 62015 tok/s +step 1304/19560 | loss 4.057337 (-1.42z)| norm 0.3097 (-1.40z)| lr 5.98e-04 | 8456.84 ms | -100.0% bf16 MFU | 62014 tok/s +step 1305/19560 | loss 4.204601 (+1.02z)| norm 0.3066 (-1.43z)| lr 5.98e-04 | 8461.47 ms | -100.0% bf16 MFU | 62011 tok/s +step 1306/19560 | loss 4.098365 (-0.72z)| norm 0.3257 (-1.12z)| lr 5.98e-04 | 8460.39 ms | -100.0% bf16 MFU | 62009 tok/s +step 1307/19560 | loss 4.123694 (-0.30z)| norm 0.3122 (-1.32z)| lr 5.98e-04 | 8457.25 ms | -100.0% bf16 MFU | 62008 tok/s +step 1308/19560 | loss 4.099452 (-0.69z)| norm 0.3449 (-0.80z)| lr 5.98e-04 | 8462.03 ms | -100.0% bf16 MFU | 62006 tok/s +step 1309/19560 | loss 4.093478 (-0.78z)| norm 0.3702 (-0.40z)| lr 5.98e-04 | 8461.61 ms | -100.0% bf16 MFU | 62003 tok/s +step 1310/19560 | loss 4.061124 (-1.30z)| norm 0.4161 (+0.33z)| lr 5.98e-04 | 8456.00 ms | -100.0% bf16 MFU | 62003 tok/s +step 1311/19560 | loss 4.052071 (-1.43z)| norm 0.4193 (+0.38z)| lr 5.98e-04 | 8451.97 ms | -100.0% bf16 MFU | 62005 tok/s +step 1312/19560 | loss 4.097813 (-0.66z)| norm 0.4409 (+0.71z)| lr 5.98e-04 | 8456.26 ms | -100.0% bf16 MFU | 62005 tok/s +step 1313/19560 | loss 4.069807 (-1.11z)| norm 0.4386 (+0.67z)| lr 5.98e-04 | 8451.82 ms | -100.0% bf16 MFU | 62006 tok/s +step 1314/19560 | loss 4.038883 (-1.61z)| norm 0.4337 (+0.60z)| lr 5.98e-04 | 8462.90 ms | -100.0% bf16 MFU | 62003 tok/s +step 1315/19560 | loss 4.086373 (-0.80z)| norm 0.3698 (-0.41z)| lr 5.98e-04 | 8452.56 ms | -100.0% bf16 MFU | 62004 tok/s +step 1316/19560 | loss 4.103936 (-0.49z)| norm 0.3511 (-0.70z)| lr 5.98e-04 | 8453.99 ms | -100.0% bf16 MFU | 62005 tok/s +step 1317/19560 | loss 4.054902 (-1.31z)| norm 0.3422 (-0.83z)| lr 5.98e-04 | 8455.85 ms | -100.0% bf16 MFU | 62005 tok/s +step 1318/19560 | loss 4.073252 (-0.98z)| norm 0.3326 (-0.97z)| lr 5.98e-04 | 8450.38 ms | -100.0% bf16 MFU | 62007 tok/s +step 1319/19560 | loss 4.064563 (-1.11z)| norm 0.3569 (-0.58z)| lr 5.98e-04 | 8457.64 ms | -100.0% bf16 MFU | 62006 tok/s +step 1320/19560 | loss 4.093224 (-0.62z)| norm 0.3912 (-0.03z)| lr 5.98e-04 | 8456.90 ms | -100.0% bf16 MFU | 62005 tok/s +step 1321/19560 | loss 4.067002 (-1.05z)| norm 0.3829 (-0.16z)| lr 5.98e-04 | 8455.11 ms | -100.0% bf16 MFU | 62006 tok/s +step 1322/19560 | loss 4.093985 (-0.58z)| norm 0.3493 (-0.69z)| lr 5.98e-04 | 8455.54 ms | -100.0% bf16 MFU | 62006 tok/s +step 1323/19560 | loss 4.075852 (-0.88z)| norm 0.3433 (-0.78z)| lr 5.98e-04 | 8457.33 ms | -100.0% bf16 MFU | 62005 tok/s +step 1324/19560 | loss 4.108827 (-0.31z)| norm 0.3215 (-1.11z)| lr 5.98e-04 | 8455.80 ms | -100.0% bf16 MFU | 62005 tok/s +step 1325/19560 | loss 4.058160 (-1.16z)| norm 0.3370 (-0.85z)| lr 5.98e-04 | 8457.50 ms | -100.0% bf16 MFU | 62004 tok/s +step 1326/19560 | loss 4.073584 (-0.88z)| norm 0.3471 (-0.67z)| lr 5.98e-04 | 8450.65 ms | -100.0% bf16 MFU | 62006 tok/s +step 1327/19560 | loss 4.043180 (-1.38z)| norm 0.3305 (-0.93z)| lr 5.98e-04 | 8456.75 ms | -100.0% bf16 MFU | 62005 tok/s +step 1328/19560 | loss 4.102676 (-0.35z)| norm 0.3176 (-1.13z)| lr 5.98e-04 | 8455.40 ms | -100.0% bf16 MFU | 62006 tok/s +step 1329/19560 | loss 4.038307 (-1.49z)| norm 0.2891 (-1.57z)| lr 5.98e-04 | 8454.89 ms | -100.0% bf16 MFU | 62006 tok/s +step 1330/19560 | loss 4.040035 (-1.43z)| norm 0.2961 (-1.44z)| lr 5.98e-04 | 8454.49 ms | -100.0% bf16 MFU | 62006 tok/s +step 1331/19560 | loss 4.081891 (-0.67z)| norm 0.3593 (-0.42z)| lr 5.98e-04 | 8452.17 ms | -100.0% bf16 MFU | 62007 tok/s +step 1332/19560 | loss 4.140193 (+0.41z)| norm 0.4279 (+0.69z)| lr 5.98e-04 | 8457.00 ms | -100.0% bf16 MFU | 62007 tok/s +step 1333/19560 | loss 4.073351 (-0.81z)| norm 0.4639 (+1.26z)| lr 5.98e-04 | 8446.22 ms | -100.0% bf16 MFU | 62010 tok/s +step 1334/19560 | loss 4.050759 (-1.20z)| norm 0.4425 (+0.89z)| lr 5.98e-04 | 8455.16 ms | -100.0% bf16 MFU | 62010 tok/s +step 1335/19560 | loss 4.090566 (-0.46z)| norm 0.4570 (+1.11z)| lr 5.98e-04 | 8462.74 ms | -100.0% bf16 MFU | 62007 tok/s +step 1336/19560 | loss 4.061267 (-0.99z)| norm 0.3967 (+0.12z)| lr 5.98e-04 | 8451.96 ms | -100.0% bf16 MFU | 62008 tok/s +step 1337/19560 | loss 4.065548 (-0.90z)| norm 0.3708 (-0.31z)| lr 5.98e-04 | 8455.38 ms | -100.0% bf16 MFU | 62008 tok/s +step 1338/19560 | loss 4.072998 (-0.75z)| norm 0.3290 (-0.99z)| lr 5.98e-04 | 8451.00 ms | -100.0% bf16 MFU | 62010 tok/s +step 1339/19560 | loss 4.023668 (-1.63z)| norm 0.3286 (-0.98z)| lr 5.98e-04 | 8452.95 ms | -100.0% bf16 MFU | 62010 tok/s +step 1340/19560 | loss 4.046833 (-1.19z)| norm 0.3623 (-0.43z)| lr 5.98e-04 | 8453.79 ms | -100.0% bf16 MFU | 62011 tok/s +step 1341/19560 | loss 4.030443 (-1.46z)| norm 0.3976 (+0.15z)| lr 5.98e-04 | 8449.69 ms | -100.0% bf16 MFU | 62013 tok/s +step 1342/19560 | loss 4.056561 (-0.98z)| norm 0.3946 (+0.10z)| lr 5.98e-04 | 8451.59 ms | -100.0% bf16 MFU | 62014 tok/s +step 1343/19560 | loss 4.060518 (-0.89z)| norm 0.3965 (+0.13z)| lr 5.98e-04 | 8451.17 ms | -100.0% bf16 MFU | 62015 tok/s +step 1344/19560 | loss 4.006038 (-1.88z)| norm 0.3501 (-0.61z)| lr 5.98e-04 | 8452.12 ms | -100.0% bf16 MFU | 62016 tok/s +step 1345/19560 | loss 4.003447 (-1.89z)| norm 0.3325 (-0.89z)| lr 5.98e-04 | 8454.73 ms | -100.0% bf16 MFU | 62015 tok/s +step 1346/19560 | loss 4.015017 (-1.65z)| norm 0.3183 (-1.11z)| lr 5.98e-04 | 8447.21 ms | -100.0% bf16 MFU | 62018 tok/s +step 1347/19560 | loss 4.071662 (-0.59z)| norm 0.3256 (-0.97z)| lr 5.98e-04 | 8454.33 ms | -100.0% bf16 MFU | 62018 tok/s +step 1348/19560 | loss 3.971406 (-2.38z)| norm 0.3255 (-0.97z)| lr 5.98e-04 | 8453.37 ms | -100.0% bf16 MFU | 62018 tok/s +step 1349/19560 | loss 4.052635 (-0.90z)| norm 0.3754 (-0.14z)| lr 5.98e-04 | 8452.08 ms | -100.0% bf16 MFU | 62019 tok/s +step 1350/19560 | loss 3.995623 (-1.96z)| norm 0.3677 (-0.26z)| lr 5.98e-04 | 8456.43 ms | -100.0% bf16 MFU | 62018 tok/s +step 1351/19560 | loss 4.127906 (+0.54z)| norm 0.4040 (+0.35z)| lr 5.98e-04 | 8451.26 ms | -100.0% bf16 MFU | 62019 tok/s +step 1352/19560 | loss 4.095238 (-0.06z)| norm 0.4470 (+1.05z)| lr 5.98e-04 | 8450.89 ms | -100.0% bf16 MFU | 62020 tok/s +step 1353/19560 | loss 4.075559 (-0.44z)| norm 0.4015 (+0.29z)| lr 5.98e-04 | 8450.89 ms | -100.0% bf16 MFU | 62021 tok/s +step 1354/19560 | loss 4.035944 (-1.19z)| norm 0.3772 (-0.12z)| lr 5.98e-04 | 8448.63 ms | -100.0% bf16 MFU | 62022 tok/s +step 1355/19560 | loss 4.050413 (-0.89z)| norm 0.3480 (-0.61z)| lr 5.98e-04 | 8445.43 ms | -100.0% bf16 MFU | 62025 tok/s +step 1356/19560 | loss 4.056163 (-0.77z)| norm 0.3534 (-0.52z)| lr 5.98e-04 | 8447.15 ms | -100.0% bf16 MFU | 62027 tok/s +step 1357/19560 | loss 4.086695 (-0.16z)| norm 0.3672 (-0.29z)| lr 5.98e-04 | 8452.20 ms | -100.0% bf16 MFU | 62027 tok/s +step 1358/19560 | loss 4.039296 (-1.08z)| norm 0.3315 (-0.88z)| lr 5.98e-04 | 8455.93 ms | -100.0% bf16 MFU | 62026 tok/s +step 1359/19560 | loss 4.099192 (+0.10z)| norm 0.3501 (-0.56z)| lr 5.98e-04 | 8450.26 ms | -100.0% bf16 MFU | 62027 tok/s +step 1360/19560 | loss 4.049290 (-0.88z)| norm 0.3354 (-0.79z)| lr 5.98e-04 | 8446.43 ms | -100.0% bf16 MFU | 62029 tok/s +step 1361/19560 | loss 4.072518 (-0.41z)| norm 0.3120 (-1.17z)| lr 5.98e-04 | 8448.59 ms | -100.0% bf16 MFU | 62031 tok/s +step 1362/19560 | loss 3.987796 (-2.10z)| norm 0.3007 (-1.33z)| lr 5.98e-04 | 8449.35 ms | -100.0% bf16 MFU | 62032 tok/s +step 1363/19560 | loss 4.075448 (-0.31z)| norm 0.3368 (-0.72z)| lr 5.98e-04 | 8446.98 ms | -100.0% bf16 MFU | 62033 tok/s +step 1364/19560 | loss 4.099206 (+0.18z)| norm 0.3671 (-0.21z)| lr 5.98e-04 | 8452.93 ms | -100.0% bf16 MFU | 62033 tok/s +step 1365/19560 | loss 4.068336 (-0.46z)| norm 0.4794 (+1.62z)| lr 5.98e-04 | 8446.47 ms | -100.0% bf16 MFU | 62035 tok/s +step 1366/19560 | loss 4.116145 (+0.63z)| norm 0.5222 (+2.26z)| lr 5.98e-04 | 8453.15 ms | -100.0% bf16 MFU | 62034 tok/s +step 1367/19560 | loss 3.990225 (-2.19z)| norm 0.4855 (+1.65z)| lr 5.98e-04 | 8445.58 ms | -100.0% bf16 MFU | 62037 tok/s +step 1368/19560 | loss 4.048339 (-0.86z)| norm 0.4215 (+0.62z)| lr 5.98e-04 | 8450.70 ms | -100.0% bf16 MFU | 62037 tok/s +step 1369/19560 | loss 4.044579 (-0.93z)| norm 0.3756 (-0.11z)| lr 5.98e-04 | 8449.25 ms | -100.0% bf16 MFU | 62038 tok/s +step 1370/19560 | loss 4.010268 (-1.68z)| norm 0.3561 (-0.42z)| lr 5.98e-04 | 8450.08 ms | -100.0% bf16 MFU | 62038 tok/s +step 1371/19560 | loss 4.112507 (+0.64z)| norm 0.3399 (-0.67z)| lr 5.98e-04 | 8449.57 ms | -100.0% bf16 MFU | 62038 tok/s +step 1372/19560 | loss 4.064627 (-0.44z)| norm 0.3624 (-0.31z)| lr 5.98e-04 | 8446.43 ms | -100.0% bf16 MFU | 62040 tok/s +step 1373/19560 | loss 4.155902 (+1.65z)| norm 0.3595 (-0.35z)| lr 5.98e-04 | 8448.69 ms | -100.0% bf16 MFU | 62041 tok/s +step 1374/19560 | loss 4.045914 (-0.85z)| norm 0.3509 (-0.48z)| lr 5.98e-04 | 8453.42 ms | -100.0% bf16 MFU | 62040 tok/s +step 1375/19560 | loss 4.074848 (-0.18z)| norm 0.3314 (-0.78z)| lr 5.98e-04 | 8448.26 ms | -100.0% bf16 MFU | 62041 tok/s +step 1376/19560 | loss 3.976948 (-2.37z)| norm 0.3319 (-0.76z)| lr 5.98e-04 | 8449.60 ms | -100.0% bf16 MFU | 62041 tok/s +step 1377/19560 | loss 4.076643 (-0.09z)| norm 0.3453 (-0.53z)| lr 5.98e-04 | 8448.72 ms | -100.0% bf16 MFU | 62042 tok/s +step 1378/19560 | loss 4.009716 (-1.63z)| norm 0.3255 (-0.84z)| lr 5.98e-04 | 8447.88 ms | -100.0% bf16 MFU | 62043 tok/s +step 1379/19560 | loss 4.051308 (-0.65z)| norm 0.3965 (+0.29z)| lr 5.98e-04 | 8449.76 ms | -100.0% bf16 MFU | 62043 tok/s +step 1380/19560 | loss 4.144599 (+1.53z)| norm 0.3604 (-0.29z)| lr 5.98e-04 | 8446.24 ms | -100.0% bf16 MFU | 62045 tok/s +step 1381/19560 | loss 4.091660 (+0.29z)| norm 0.3279 (-0.81z)| lr 5.98e-04 | 8449.79 ms | -100.0% bf16 MFU | 62045 tok/s +step 1382/19560 | loss 4.003113 (-1.76z)| norm 0.3857 (+0.11z)| lr 5.98e-04 | 8450.18 ms | -100.0% bf16 MFU | 62045 tok/s +step 1383/19560 | loss 4.090867 (+0.31z)| norm 0.4098 (+0.49z)| lr 5.98e-04 | 8445.50 ms | -100.0% bf16 MFU | 62047 tok/s +step 1384/19560 | loss 4.067209 (-0.25z)| norm 0.3778 (-0.04z)| lr 5.98e-04 | 8447.06 ms | -100.0% bf16 MFU | 62048 tok/s +step 1385/19560 | loss 4.045808 (-0.75z)| norm 0.3773 (-0.05z)| lr 5.98e-04 | 8450.16 ms | -100.0% bf16 MFU | 62047 tok/s +step 1386/19560 | loss 4.042448 (-0.82z)| norm 0.3923 (+0.19z)| lr 5.98e-04 | 8449.63 ms | -100.0% bf16 MFU | 62047 tok/s +step 1387/19560 | loss 4.057737 (-0.47z)| norm 0.3419 (-0.64z)| lr 5.98e-04 | 8447.26 ms | -100.0% bf16 MFU | 62048 tok/s +step 1388/19560 | loss 4.062101 (-0.36z)| norm 0.3135 (-1.10z)| lr 5.98e-04 | 8447.61 ms | -100.0% bf16 MFU | 62049 tok/s +step 1389/19560 | loss 4.048796 (-0.67z)| norm 0.2987 (-1.32z)| lr 5.98e-04 | 8451.88 ms | -100.0% bf16 MFU | 62048 tok/s +step 1390/19560 | loss 4.058056 (-0.44z)| norm 0.3279 (-0.85z)| lr 5.98e-04 | 8447.52 ms | -100.0% bf16 MFU | 62049 tok/s +step 1391/19560 | loss 4.068292 (-0.19z)| norm 0.3323 (-0.78z)| lr 5.98e-04 | 8450.76 ms | -100.0% bf16 MFU | 62049 tok/s +step 1392/19560 | loss 4.098712 (+0.53z)| norm 0.3179 (-1.01z)| lr 5.98e-04 | 8447.81 ms | -100.0% bf16 MFU | 62049 tok/s +step 1393/19560 | loss 4.025096 (-1.21z)| norm 0.3142 (-1.06z)| lr 5.98e-04 | 8453.16 ms | -100.0% bf16 MFU | 62048 tok/s +step 1394/19560 | loss 4.057722 (-0.43z)| norm 0.3619 (-0.29z)| lr 5.98e-04 | 8450.51 ms | -100.0% bf16 MFU | 62048 tok/s +step 1395/19560 | loss 4.130116 (+1.28z)| norm 0.3666 (-0.21z)| lr 5.98e-04 | 8446.41 ms | -100.0% bf16 MFU | 62049 tok/s +step 1396/19560 | loss 4.064514 (-0.26z)| norm 0.4281 (+0.77z)| lr 5.98e-04 | 8454.08 ms | -100.0% bf16 MFU | 62047 tok/s +step 1397/19560 | loss 4.043656 (-0.76z)| norm 0.5155 (+2.13z)| lr 5.98e-04 | 8447.49 ms | -100.0% bf16 MFU | 62048 tok/s +step 1398/19560 | loss 4.018836 (-1.32z)| norm 0.4667 (+1.34z)| lr 5.98e-04 | 8448.47 ms | -100.0% bf16 MFU | 62049 tok/s +step 1399/19560 | loss 4.078435 (+0.09z)| norm 0.3917 (+0.16z)| lr 5.98e-04 | 8445.12 ms | -100.0% bf16 MFU | 62050 tok/s +step 1400/19560 | loss 4.077717 (+0.06z)| norm 0.3797 (-0.01z)| lr 5.98e-04 | 8450.68 ms | -100.0% bf16 MFU | 62050 tok/s +step 1401/19560 | loss 4.050947 (-0.56z)| norm 0.3463 (-0.54z)| lr 5.98e-04 | 8446.50 ms | -100.0% bf16 MFU | 62051 tok/s +step 1402/19560 | loss 4.056046 (-0.42z)| norm 0.3320 (-0.78z)| lr 5.98e-04 | 8445.10 ms | -100.0% bf16 MFU | 62052 tok/s +step 1403/19560 | loss 4.036520 (-0.89z)| norm 0.3257 (-0.87z)| lr 5.98e-04 | 8449.70 ms | -100.0% bf16 MFU | 62052 tok/s +step 1404/19560 | loss 4.020450 (-1.27z)| norm 0.3093 (-1.14z)| lr 5.98e-04 | 8446.40 ms | -100.0% bf16 MFU | 62053 tok/s +step 1405/19560 | loss 4.053689 (-0.44z)| norm 0.2999 (-1.28z)| lr 5.98e-04 | 8449.46 ms | -100.0% bf16 MFU | 62053 tok/s +step 1406/19560 | loss 4.006511 (-1.58z)| norm 0.2882 (-1.45z)| lr 5.98e-04 | 8447.36 ms | -100.0% bf16 MFU | 62054 tok/s +step 1407/19560 | loss 4.052502 (-0.44z)| norm 0.2925 (-1.36z)| lr 5.98e-04 | 8449.23 ms | -100.0% bf16 MFU | 62054 tok/s +step 1408/19560 | loss 4.018495 (-1.27z)| norm 0.2958 (-1.28z)| lr 5.98e-04 | 8447.05 ms | -100.0% bf16 MFU | 62054 tok/s +step 1409/19560 | loss 4.077216 (+0.19z)| norm 0.3393 (-0.54z)| lr 5.98e-04 | 8443.39 ms | -100.0% bf16 MFU | 62056 tok/s +step 1410/19560 | loss 4.027898 (-1.03z)| norm 0.4122 (+0.81z)| lr 5.98e-04 | 8439.61 ms | -100.0% bf16 MFU | 62060 tok/s +step 1411/19560 | loss 4.026749 (-1.05z)| norm 0.4706 (+1.86z)| lr 5.98e-04 | 8443.33 ms | -100.0% bf16 MFU | 62061 tok/s +step 1412/19560 | loss 4.051097 (-0.42z)| norm 0.4245 (+0.99z)| lr 5.98e-04 | 8438.50 ms | -100.0% bf16 MFU | 62065 tok/s +step 1413/19560 | loss 4.099133 (+0.81z)| norm 0.3830 (+0.22z)| lr 5.98e-04 | 8441.79 ms | -100.0% bf16 MFU | 62067 tok/s +step 1414/19560 | loss 4.045061 (-0.57z)| norm 0.3773 (+0.11z)| lr 5.98e-04 | 8444.28 ms | -100.0% bf16 MFU | 62068 tok/s +step 1415/19560 | loss 4.111722 (+1.12z)| norm 0.3773 (+0.10z)| lr 5.98e-04 | 8440.06 ms | -100.0% bf16 MFU | 62070 tok/s +step 1416/19560 | loss 4.025219 (-1.07z)| norm 0.3846 (+0.23z)| lr 5.98e-04 | 8441.71 ms | -100.0% bf16 MFU | 62072 tok/s +step 1417/19560 | loss 3.985179 (-2.04z)| norm 0.3367 (-0.65z)| lr 5.98e-04 | 8444.00 ms | -100.0% bf16 MFU | 62073 tok/s +step 1418/19560 | loss 4.042974 (-0.58z)| norm 0.3210 (-0.92z)| lr 5.98e-04 | 8443.09 ms | -100.0% bf16 MFU | 62074 tok/s +step 1419/19560 | loss 3.984941 (-2.00z)| norm 0.3954 (+0.48z)| lr 5.98e-04 | 8444.43 ms | -100.0% bf16 MFU | 62075 tok/s +step 1420/19560 | loss 4.028018 (-0.92z)| norm 0.3967 (+0.51z)| lr 5.98e-04 | 8447.56 ms | -100.0% bf16 MFU | 62074 tok/s +step 1421/19560 | loss 4.054559 (-0.24z)| norm 0.3753 (+0.12z)| lr 5.98e-04 | 8444.55 ms | -100.0% bf16 MFU | 62075 tok/s +step 1422/19560 | loss 3.981284 (-2.06z)| norm 0.3320 (-0.70z)| lr 5.98e-04 | 8444.45 ms | -100.0% bf16 MFU | 62076 tok/s +step 1423/19560 | loss 4.029647 (-0.83z)| norm 0.3546 (-0.24z)| lr 5.98e-04 | 8445.42 ms | -100.0% bf16 MFU | 62076 tok/s +step 1424/19560 | loss 4.015203 (-1.18z)| norm 0.3606 (-0.11z)| lr 5.98e-04 | 8447.18 ms | -100.0% bf16 MFU | 62075 tok/s +step 1425/19560 | loss 4.003534 (-1.45z)| norm 0.3244 (-0.84z)| lr 5.98e-04 | 8445.80 ms | -100.0% bf16 MFU | 62075 tok/s +step 1426/19560 | loss 3.999188 (-1.55z)| norm 0.2870 (-1.59z)| lr 5.98e-04 | 8447.95 ms | -100.0% bf16 MFU | 62075 tok/s +step 1427/19560 | loss 4.055611 (-0.10z)| norm 0.2929 (-1.45z)| lr 5.98e-04 | 8449.68 ms | -100.0% bf16 MFU | 62073 tok/s +step 1428/19560 | loss 4.019932 (-1.00z)| norm 0.3419 (-0.43z)| lr 5.98e-04 | 8450.07 ms | -100.0% bf16 MFU | 62072 tok/s +step 1429/19560 | loss 3.994381 (-1.63z)| norm 0.3770 (+0.28z)| lr 5.98e-04 | 8447.42 ms | -100.0% bf16 MFU | 62072 tok/s +step 1430/19560 | loss 4.042167 (-0.40z)| norm 0.3574 (-0.13z)| lr 5.98e-04 | 8449.62 ms | -100.0% bf16 MFU | 62070 tok/s +step 1431/19560 | loss 4.016007 (-1.05z)| norm 0.3443 (-0.40z)| lr 5.98e-04 | 8447.87 ms | -100.0% bf16 MFU | 62070 tok/s +step 1432/19560 | loss 3.998382 (-1.48z)| norm 0.3627 (-0.03z)| lr 5.98e-04 | 8446.16 ms | -100.0% bf16 MFU | 62070 tok/s +step 1433/19560 | loss 3.979362 (-1.99z)| norm 0.3418 (-0.47z)| lr 5.98e-04 | 8448.10 ms | -100.0% bf16 MFU | 62070 tok/s +step 1434/19560 | loss 4.017851 (-0.96z)| norm 0.3101 (-1.13z)| lr 5.98e-04 | 8452.53 ms | -100.0% bf16 MFU | 62068 tok/s +step 1435/19560 | loss 4.031555 (-0.59z)| norm 0.3105 (-1.12z)| lr 5.98e-04 | 8448.11 ms | -100.0% bf16 MFU | 62067 tok/s +step 1436/19560 | loss 3.982317 (-1.87z)| norm 0.3156 (-1.01z)| lr 5.98e-04 | 8448.66 ms | -100.0% bf16 MFU | 62067 tok/s +step 1437/19560 | loss 4.008374 (-1.16z)| norm 0.3313 (-0.67z)| lr 5.98e-04 | 8448.96 ms | -100.0% bf16 MFU | 62066 tok/s +step 1438/19560 | loss 3.990237 (-1.61z)| norm 0.3480 (-0.32z)| lr 5.98e-04 | 8448.40 ms | -100.0% bf16 MFU | 62066 tok/s +step 1439/19560 | loss 4.043382 (-0.21z)| norm 0.3354 (-0.57z)| lr 5.98e-04 | 8448.13 ms | -100.0% bf16 MFU | 62065 tok/s +step 1440/19560 | loss 3.986069 (-1.68z)| norm 0.3361 (-0.54z)| lr 5.98e-04 | 8449.21 ms | -100.0% bf16 MFU | 62065 tok/s +step 1441/19560 | loss 4.060743 (+0.27z)| norm 0.3365 (-0.52z)| lr 5.98e-04 | 8464.22 ms | -100.0% bf16 MFU | 62058 tok/s +step 1442/19560 | loss 4.009383 (-1.06z)| norm 0.3127 (-1.01z)| lr 5.98e-04 | 8476.65 ms | -100.0% bf16 MFU | 62048 tok/s +step 1443/19560 | loss 4.043622 (-0.17z)| norm 0.3382 (-0.46z)| lr 5.98e-04 | 8473.56 ms | -100.0% bf16 MFU | 62039 tok/s +step 1444/19560 | loss 3.973964 (-1.94z)| norm 0.3208 (-0.83z)| lr 5.98e-04 | 8475.94 ms | -100.0% bf16 MFU | 62030 tok/s +step 1445/19560 | loss 4.056277 (+0.19z)| norm 0.2940 (-1.38z)| lr 5.98e-04 | 8474.89 ms | -100.0% bf16 MFU | 62022 tok/s +step 1446/19560 | loss 4.020406 (-0.73z)| norm 0.3252 (-0.72z)| lr 5.98e-04 | 8472.22 ms | -100.0% bf16 MFU | 62015 tok/s +step 1447/19560 | loss 4.024233 (-0.62z)| norm 0.3349 (-0.51z)| lr 5.98e-04 | 8469.10 ms | -100.0% bf16 MFU | 62009 tok/s +step 1448/19560 | loss 4.045681 (-0.06z)| norm 0.3824 (+0.50z)| lr 5.98e-04 | 8473.07 ms | -100.0% bf16 MFU | 62003 tok/s +step 1449/19560 | loss 4.011631 (-0.93z)| norm 0.4038 (+0.95z)| lr 5.98e-04 | 8473.53 ms | -100.0% bf16 MFU | 61996 tok/s +step 1450/19560 | loss 3.983350 (-1.63z)| norm 0.4000 (+0.86z)| lr 5.98e-04 | 8476.60 ms | -100.0% bf16 MFU | 61989 tok/s +step 1451/19560 | loss 4.066075 (+0.51z)| norm 0.3788 (+0.40z)| lr 5.98e-04 | 8466.59 ms | -100.0% bf16 MFU | 61986 tok/s +step 1452/19560 | loss 4.086060 (+1.03z)| norm 0.4628 (+2.12z)| lr 5.98e-04 | 8468.92 ms | -100.0% bf16 MFU | 61982 tok/s +step 1453/19560 | loss 4.036726 (-0.25z)| norm 0.4734 (+2.27z)| lr 5.98e-04 | 8466.73 ms | -100.0% bf16 MFU | 61979 tok/s +step 1454/19560 | loss 4.093197 (+1.21z)| norm 0.4979 (+2.67z)| lr 5.98e-04 | 8472.90 ms | -100.0% bf16 MFU | 61974 tok/s +step 1455/19560 | loss 4.055015 (+0.22z)| norm 0.4335 (+1.37z)| lr 5.98e-04 | 8463.06 ms | -100.0% bf16 MFU | 61973 tok/s +step 1456/19560 | loss 4.015770 (-0.78z)| norm 0.3388 (-0.50z)| lr 5.98e-04 | 8465.32 ms | -100.0% bf16 MFU | 61971 tok/s +step 1457/19560 | loss 4.099201 (+1.37z)| norm 0.3139 (-1.00z)| lr 5.98e-04 | 8468.37 ms | -100.0% bf16 MFU | 61968 tok/s +step 1458/19560 | loss 4.083042 (+0.94z)| norm 0.3321 (-0.64z)| lr 5.98e-04 | 8469.22 ms | -100.0% bf16 MFU | 61965 tok/s +step 1459/19560 | loss 4.051965 (+0.15z)| norm 0.3266 (-0.75z)| lr 5.98e-04 | 8471.60 ms | -100.0% bf16 MFU | 61961 tok/s +step 1460/19560 | loss 4.048542 (+0.08z)| norm 0.3050 (-1.16z)| lr 5.98e-04 | 8463.67 ms | -100.0% bf16 MFU | 61960 tok/s +step 1461/19560 | loss 4.050568 (+0.14z)| norm 0.2878 (-1.48z)| lr 5.98e-04 | 8463.03 ms | -100.0% bf16 MFU | 61960 tok/s +step 1462/19560 | loss 4.066040 (+0.54z)| norm 0.3026 (-1.17z)| lr 5.98e-04 | 8460.04 ms | -100.0% bf16 MFU | 61960 tok/s +step 1463/19560 | loss 4.074172 (+0.76z)| norm 0.3220 (-0.77z)| lr 5.98e-04 | 8462.84 ms | -100.0% bf16 MFU | 61960 tok/s +step 1464/19560 | loss 4.003621 (-1.09z)| norm 0.3542 (-0.11z)| lr 5.98e-04 | 8471.18 ms | -100.0% bf16 MFU | 61956 tok/s +step 1465/19560 | loss 4.019692 (-0.66z)| norm 0.4124 (+1.07z)| lr 5.98e-04 | 8466.66 ms | -100.0% bf16 MFU | 61955 tok/s +step 1466/19560 | loss 4.033256 (-0.29z)| norm 0.4245 (+1.30z)| lr 5.98e-04 | 8460.89 ms | -100.0% bf16 MFU | 61955 tok/s +step 1467/19560 | loss 4.024782 (-0.52z)| norm 0.3769 (+0.32z)| lr 5.98e-04 | 8466.67 ms | -100.0% bf16 MFU | 61954 tok/s +step 1468/19560 | loss 4.032922 (-0.30z)| norm 0.3916 (+0.62z)| lr 5.98e-04 | 8468.28 ms | -100.0% bf16 MFU | 61952 tok/s +step 1469/19560 | loss 4.052581 (+0.22z)| norm 0.3923 (+0.63z)| lr 5.98e-04 | 8467.22 ms | -100.0% bf16 MFU | 61950 tok/s +step 1470/19560 | loss 4.081695 (+0.98z)| norm 0.3355 (-0.51z)| lr 5.98e-04 | 8467.20 ms | -100.0% bf16 MFU | 61949 tok/s +step 1471/19560 | loss 3.990594 (-1.40z)| norm 0.3198 (-0.81z)| lr 5.98e-04 | 8467.33 ms | -100.0% bf16 MFU | 61947 tok/s +step 1472/19560 | loss 4.066213 (+0.57z)| norm 0.3246 (-0.71z)| lr 5.98e-04 | 8464.13 ms | -100.0% bf16 MFU | 61947 tok/s +step 1473/19560 | loss 3.985903 (-1.53z)| norm 0.3008 (-1.18z)| lr 5.98e-04 | 8457.85 ms | -100.0% bf16 MFU | 61949 tok/s +step 1474/19560 | loss 4.030742 (-0.36z)| norm 0.2958 (-1.27z)| lr 5.98e-04 | 8459.36 ms | -100.0% bf16 MFU | 61950 tok/s +step 1475/19560 | loss 3.973120 (-1.83z)| norm 0.2835 (-1.50z)| lr 5.98e-04 | 8464.89 ms | -100.0% bf16 MFU | 61950 tok/s +step 1476/19560 | loss 4.010602 (-0.88z)| norm 0.2933 (-1.30z)| lr 5.98e-04 | 8461.80 ms | -100.0% bf16 MFU | 61950 tok/s +step 1477/19560 | loss 3.993882 (-1.29z)| norm 0.3080 (-0.99z)| lr 5.97e-04 | 8460.58 ms | -100.0% bf16 MFU | 61951 tok/s +step 1478/19560 | loss 4.006203 (-0.98z)| norm 0.3102 (-0.94z)| lr 5.97e-04 | 8464.81 ms | -100.0% bf16 MFU | 61950 tok/s +step 1479/19560 | loss 3.933434 (-2.80z)| norm 0.3025 (-1.07z)| lr 5.97e-04 | 8455.49 ms | -100.0% bf16 MFU | 61953 tok/s +step 1480/19560 | loss 4.067052 (+0.65z)| norm 0.3234 (-0.65z)| lr 5.97e-04 | 8463.14 ms | -100.0% bf16 MFU | 61953 tok/s +step 1481/19560 | loss 4.006406 (-0.91z)| norm 0.3351 (-0.41z)| lr 5.97e-04 | 8459.23 ms | -100.0% bf16 MFU | 61954 tok/s +step 1482/19560 | loss 4.008633 (-0.84z)| norm 0.3512 (-0.08z)| lr 5.97e-04 | 8460.42 ms | -100.0% bf16 MFU | 61955 tok/s +step 1483/19560 | loss 4.062780 (+0.56z)| norm 0.3238 (-0.62z)| lr 5.97e-04 | 8462.82 ms | -100.0% bf16 MFU | 61955 tok/s +step 1484/19560 | loss 3.962893 (-1.97z)| norm 0.3034 (-1.02z)| lr 5.97e-04 | 8466.34 ms | -100.0% bf16 MFU | 61953 tok/s +step 1485/19560 | loss 4.073460 (+0.84z)| norm 0.3084 (-0.91z)| lr 5.97e-04 | 8457.69 ms | -100.0% bf16 MFU | 61955 tok/s +step 1486/19560 | loss 4.000805 (-1.00z)| norm 0.3297 (-0.48z)| lr 5.97e-04 | 8459.46 ms | -100.0% bf16 MFU | 61956 tok/s +step 1487/19560 | loss 4.054874 (+0.39z)| norm 0.3702 (+0.31z)| lr 5.97e-04 | 8462.50 ms | -100.0% bf16 MFU | 61956 tok/s +step 1488/19560 | loss 3.961807 (-1.95z)| norm 0.3843 (+0.58z)| lr 5.97e-04 | 8460.56 ms | -100.0% bf16 MFU | 61957 tok/s +step 1489/19560 | loss 4.028715 (-0.25z)| norm 0.3570 (+0.04z)| lr 5.97e-04 | 8463.95 ms | -100.0% bf16 MFU | 61956 tok/s +step 1490/19560 | loss 4.035498 (-0.09z)| norm 0.3260 (-0.58z)| lr 5.97e-04 | 8462.30 ms | -100.0% bf16 MFU | 61956 tok/s +step 1491/19560 | loss 4.045455 (+0.17z)| norm 0.3243 (-0.61z)| lr 5.97e-04 | 8461.70 ms | -100.0% bf16 MFU | 61956 tok/s +step 1492/19560 | loss 3.999215 (-1.00z)| norm 0.3386 (-0.33z)| lr 5.97e-04 | 8463.56 ms | -100.0% bf16 MFU | 61956 tok/s +step 1493/19560 | loss 3.986650 (-1.30z)| norm 0.3691 (+0.30z)| lr 5.97e-04 | 8455.38 ms | -100.0% bf16 MFU | 61958 tok/s +step 1494/19560 | loss 4.075274 (+0.99z)| norm 0.4226 (+1.46z)| lr 5.97e-04 | 8463.95 ms | -100.0% bf16 MFU | 61958 tok/s +step 1495/19560 | loss 4.003753 (-0.87z)| norm 0.4314 (+1.69z)| lr 5.97e-04 | 8457.67 ms | -100.0% bf16 MFU | 61959 tok/s +step 1496/19560 | loss 4.030204 (-0.18z)| norm 0.4287 (+1.63z)| lr 5.97e-04 | 8458.02 ms | -100.0% bf16 MFU | 61961 tok/s +step 1497/19560 | loss 4.056576 (+0.50z)| norm 0.3309 (-0.47z)| lr 5.97e-04 | 8460.28 ms | -100.0% bf16 MFU | 61961 tok/s +step 1498/19560 | loss 3.993804 (-1.12z)| norm 0.3424 (-0.22z)| lr 5.97e-04 | 8458.63 ms | -100.0% bf16 MFU | 61962 tok/s +step 1499/19560 | loss 3.972743 (-1.64z)| norm 0.3389 (-0.29z)| lr 5.97e-04 | 8456.90 ms | -100.0% bf16 MFU | 61964 tok/s +step 1500/19560 | loss 4.035563 (-0.00z)| norm 0.3397 (-0.27z)| lr 5.97e-04 | 8460.21 ms | -100.0% bf16 MFU | 61964 tok/s +val loss 4.005649 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2632/10042 = 0.262099 +step 1501/19560 | loss 4.026576 (-0.22z)| norm 0.3448 (-0.16z)| lr 5.97e-04 | 8462.15 ms | -100.0% bf16 MFU | 61964 tok/s +step 1502/19560 | loss 3.960539 (-1.96z)| norm 0.3397 (-0.27z)| lr 5.97e-04 | 8459.42 ms | -100.0% bf16 MFU | 61964 tok/s +step 1503/19560 | loss 4.008293 (-0.68z)| norm 0.3480 (-0.09z)| lr 5.97e-04 | 8457.77 ms | -100.0% bf16 MFU | 61966 tok/s +step 1504/19560 | loss 4.030705 (-0.09z)| norm 0.3627 (+0.22z)| lr 5.97e-04 | 8460.91 ms | -100.0% bf16 MFU | 61966 tok/s +step 1505/19560 | loss 4.019689 (-0.38z)| norm 0.3246 (-0.60z)| lr 5.97e-04 | 8452.65 ms | -100.0% bf16 MFU | 61969 tok/s +step 1506/19560 | loss 4.035776 (+0.06z)| norm 0.3182 (-0.73z)| lr 5.97e-04 | 8458.50 ms | -100.0% bf16 MFU | 61969 tok/s +step 1507/19560 | loss 3.929098 (-2.74z)| norm 0.3144 (-0.80z)| lr 5.97e-04 | 8454.88 ms | -100.0% bf16 MFU | 61972 tok/s +step 1508/19560 | loss 4.009746 (-0.60z)| norm 0.3239 (-0.59z)| lr 5.97e-04 | 8460.59 ms | -100.0% bf16 MFU | 61971 tok/s +step 1509/19560 | loss 3.970757 (-1.64z)| norm 0.3709 (+0.41z)| lr 5.97e-04 | 8457.61 ms | -100.0% bf16 MFU | 61972 tok/s +step 1510/19560 | loss 3.983244 (-1.29z)| norm 0.3504 (-0.02z)| lr 5.97e-04 | 8455.72 ms | -100.0% bf16 MFU | 61974 tok/s +step 1511/19560 | loss 3.977948 (-1.41z)| norm 0.3874 (+0.78z)| lr 5.97e-04 | 8456.71 ms | -100.0% bf16 MFU | 61975 tok/s +step 1512/19560 | loss 4.047252 (+0.48z)| norm 0.4171 (+1.41z)| lr 5.97e-04 | 8453.69 ms | -100.0% bf16 MFU | 61977 tok/s +step 1513/19560 | loss 4.024359 (-0.14z)| norm 0.4458 (+1.98z)| lr 5.97e-04 | 8451.55 ms | -100.0% bf16 MFU | 61980 tok/s +step 1514/19560 | loss 4.069435 (+1.09z)| norm 0.4198 (+1.42z)| lr 5.97e-04 | 8453.66 ms | -100.0% bf16 MFU | 61982 tok/s +step 1515/19560 | loss 4.058756 (+0.80z)| norm 0.3579 (+0.11z)| lr 5.97e-04 | 8449.14 ms | -100.0% bf16 MFU | 61986 tok/s +step 1516/19560 | loss 3.967041 (-1.67z)| norm 0.3346 (-0.38z)| lr 5.97e-04 | 8458.16 ms | -100.0% bf16 MFU | 61986 tok/s +step 1517/19560 | loss 4.021040 (-0.21z)| norm 0.3344 (-0.39z)| lr 5.97e-04 | 8456.37 ms | -100.0% bf16 MFU | 61986 tok/s +step 1518/19560 | loss 3.951573 (-2.04z)| norm 0.3315 (-0.46z)| lr 5.97e-04 | 8453.32 ms | -100.0% bf16 MFU | 61988 tok/s +step 1519/19560 | loss 4.008648 (-0.50z)| norm 0.3262 (-0.57z)| lr 5.97e-04 | 8451.50 ms | -100.0% bf16 MFU | 61990 tok/s +step 1520/19560 | loss 4.000018 (-0.72z)| norm 0.2945 (-1.23z)| lr 5.97e-04 | 8452.85 ms | -100.0% bf16 MFU | 61992 tok/s +step 1521/19560 | loss 3.989424 (-1.00z)| norm 0.2966 (-1.18z)| lr 5.97e-04 | 8454.91 ms | -100.0% bf16 MFU | 61993 tok/s +step 1522/19560 | loss 4.058434 (+0.87z)| norm 0.3051 (-0.99z)| lr 5.97e-04 | 8458.77 ms | -100.0% bf16 MFU | 61992 tok/s +step 1523/19560 | loss 4.021885 (-0.10z)| norm 0.3148 (-0.77z)| lr 5.97e-04 | 8455.05 ms | -100.0% bf16 MFU | 61993 tok/s +step 1524/19560 | loss 4.055160 (+0.83z)| norm 0.3119 (-0.82z)| lr 5.97e-04 | 8460.67 ms | -100.0% bf16 MFU | 61992 tok/s +step 1525/19560 | loss 4.102869 (+2.12z)| norm 0.3357 (-0.30z)| lr 5.97e-04 | 8452.70 ms | -100.0% bf16 MFU | 61994 tok/s +step 1526/19560 | loss 3.999825 (-0.71z)| norm 0.3688 (+0.46z)| lr 5.97e-04 | 8457.10 ms | -100.0% bf16 MFU | 61994 tok/s +step 1527/19560 | loss 3.960561 (-1.76z)| norm 0.3770 (+0.65z)| lr 5.97e-04 | 8459.16 ms | -100.0% bf16 MFU | 61993 tok/s +step 1528/19560 | loss 3.978445 (-1.25z)| norm 0.3508 (+0.06z)| lr 5.97e-04 | 8456.67 ms | -100.0% bf16 MFU | 61993 tok/s +step 1529/19560 | loss 4.022300 (-0.04z)| norm 0.3583 (+0.23z)| lr 5.97e-04 | 8453.10 ms | -100.0% bf16 MFU | 61995 tok/s +step 1530/19560 | loss 3.950429 (-1.97z)| norm 0.3532 (+0.11z)| lr 5.97e-04 | 8454.15 ms | -100.0% bf16 MFU | 61996 tok/s +step 1531/19560 | loss 4.086564 (+1.70z)| norm 0.3061 (-0.97z)| lr 5.97e-04 | 8440.97 ms | -100.0% bf16 MFU | 62001 tok/s +step 1532/19560 | loss 4.002066 (-0.57z)| norm 0.2873 (-1.38z)| lr 5.97e-04 | 8440.01 ms | -100.0% bf16 MFU | 62007 tok/s +step 1533/19560 | loss 4.070051 (+1.25z)| norm 0.3270 (-0.49z)| lr 5.97e-04 | 8441.51 ms | -100.0% bf16 MFU | 62012 tok/s +step 1534/19560 | loss 4.010570 (-0.34z)| norm 0.3526 (+0.08z)| lr 5.97e-04 | 8444.04 ms | -100.0% bf16 MFU | 62016 tok/s +step 1535/19560 | loss 3.965425 (-1.52z)| norm 0.4011 (+1.18z)| lr 5.97e-04 | 8436.07 ms | -100.0% bf16 MFU | 62023 tok/s +step 1536/19560 | loss 4.005859 (-0.44z)| norm 0.4010 (+1.16z)| lr 5.97e-04 | 8440.01 ms | -100.0% bf16 MFU | 62028 tok/s +step 1537/19560 | loss 4.010100 (-0.32z)| norm 0.3510 (+0.01z)| lr 5.97e-04 | 8445.55 ms | -100.0% bf16 MFU | 62030 tok/s +step 1538/19560 | loss 3.982236 (-1.05z)| norm 0.3471 (-0.07z)| lr 5.97e-04 | 8442.60 ms | -100.0% bf16 MFU | 62034 tok/s +step 1539/19560 | loss 4.006423 (-0.40z)| norm 0.3514 (+0.05z)| lr 5.97e-04 | 8436.70 ms | -100.0% bf16 MFU | 62039 tok/s +step 1540/19560 | loss 4.074053 (+1.38z)| norm 0.3380 (-0.26z)| lr 5.97e-04 | 8441.05 ms | -100.0% bf16 MFU | 62043 tok/s +step 1541/19560 | loss 4.066321 (+1.20z)| norm 0.2968 (-1.24z)| lr 5.97e-04 | 8436.87 ms | -100.0% bf16 MFU | 62048 tok/s +step 1542/19560 | loss 4.028265 (+0.19z)| norm 0.2915 (-1.34z)| lr 5.97e-04 | 8440.50 ms | -100.0% bf16 MFU | 62051 tok/s +step 1543/19560 | loss 3.967798 (-1.43z)| norm 0.2882 (-1.40z)| lr 5.97e-04 | 8437.87 ms | -100.0% bf16 MFU | 62055 tok/s +step 1544/19560 | loss 3.998276 (-0.59z)| norm 0.2981 (-1.14z)| lr 5.97e-04 | 8436.23 ms | -100.0% bf16 MFU | 62060 tok/s +step 1545/19560 | loss 4.090557 (+1.88z)| norm 0.3306 (-0.36z)| lr 5.97e-04 | 8440.45 ms | -100.0% bf16 MFU | 62063 tok/s +step 1546/19560 | loss 3.970200 (-1.34z)| norm 0.3486 (+0.06z)| lr 5.97e-04 | 8443.30 ms | -100.0% bf16 MFU | 62064 tok/s +step 1547/19560 | loss 4.009291 (-0.30z)| norm 0.3815 (+0.85z)| lr 5.97e-04 | 8440.26 ms | -100.0% bf16 MFU | 62067 tok/s +step 1548/19560 | loss 3.978098 (-1.12z)| norm 0.3727 (+0.65z)| lr 5.97e-04 | 8440.20 ms | -100.0% bf16 MFU | 62070 tok/s +step 1549/19560 | loss 3.995859 (-0.64z)| norm 0.3580 (+0.30z)| lr 5.97e-04 | 8441.32 ms | -100.0% bf16 MFU | 62072 tok/s +step 1550/19560 | loss 3.992099 (-0.74z)| norm 0.3700 (+0.58z)| lr 5.97e-04 | 8443.03 ms | -100.0% bf16 MFU | 62073 tok/s +step 1551/19560 | loss 3.996774 (-0.61z)| norm 0.4204 (+1.76z)| lr 5.97e-04 | 8441.27 ms | -100.0% bf16 MFU | 62075 tok/s +step 1552/19560 | loss 4.094153 (+1.95z)| norm 0.3634 (+0.40z)| lr 5.97e-04 | 8438.21 ms | -100.0% bf16 MFU | 62078 tok/s +step 1553/19560 | loss 4.058696 (+1.00z)| norm 0.3584 (+0.28z)| lr 5.97e-04 | 8441.19 ms | -100.0% bf16 MFU | 62079 tok/s +step 1554/19560 | loss 3.994083 (-0.69z)| norm 0.3560 (+0.21z)| lr 5.97e-04 | 8445.31 ms | -100.0% bf16 MFU | 62079 tok/s +step 1555/19560 | loss 4.013278 (-0.18z)| norm 0.3221 (-0.61z)| lr 5.97e-04 | 8441.38 ms | -100.0% bf16 MFU | 62081 tok/s +step 1556/19560 | loss 4.004741 (-0.40z)| norm 0.3311 (-0.39z)| lr 5.97e-04 | 8442.31 ms | -100.0% bf16 MFU | 62082 tok/s +step 1557/19560 | loss 3.994608 (-0.67z)| norm 0.3054 (-1.00z)| lr 5.97e-04 | 8443.39 ms | -100.0% bf16 MFU | 62083 tok/s +step 1558/19560 | loss 3.990653 (-0.76z)| norm 0.3285 (-0.44z)| lr 5.97e-04 | 8445.98 ms | -100.0% bf16 MFU | 62082 tok/s +step 1559/19560 | loss 3.986171 (-0.87z)| norm 0.3630 (+0.39z)| lr 5.97e-04 | 8446.18 ms | -100.0% bf16 MFU | 62082 tok/s +step 1560/19560 | loss 4.031693 (+0.32z)| norm 0.3519 (+0.13z)| lr 5.97e-04 | 8449.24 ms | -100.0% bf16 MFU | 62080 tok/s +step 1561/19560 | loss 4.016633 (-0.09z)| norm 0.3487 (+0.05z)| lr 5.97e-04 | 8448.47 ms | -100.0% bf16 MFU | 62079 tok/s +step 1562/19560 | loss 4.053371 (+0.87z)| norm 0.3413 (-0.14z)| lr 5.97e-04 | 8450.73 ms | -100.0% bf16 MFU | 62077 tok/s +step 1563/19560 | loss 4.029305 (+0.24z)| norm 0.3687 (+0.51z)| lr 5.97e-04 | 8454.27 ms | -100.0% bf16 MFU | 62074 tok/s +step 1564/19560 | loss 4.013537 (-0.18z)| norm 0.4320 (+1.99z)| lr 5.97e-04 | 8449.80 ms | -100.0% bf16 MFU | 62073 tok/s +step 1565/19560 | loss 3.983687 (-0.96z)| norm 0.3862 (+0.89z)| lr 5.97e-04 | 8450.85 ms | -100.0% bf16 MFU | 62071 tok/s +step 1566/19560 | loss 3.965196 (-1.43z)| norm 0.3039 (-1.06z)| lr 5.97e-04 | 8451.62 ms | -100.0% bf16 MFU | 62069 tok/s +step 1567/19560 | loss 3.992469 (-0.71z)| norm 0.3649 (+0.39z)| lr 5.97e-04 | 8450.37 ms | -100.0% bf16 MFU | 62068 tok/s +step 1568/19560 | loss 4.004765 (-0.39z)| norm 0.3454 (-0.08z)| lr 5.97e-04 | 8446.90 ms | -100.0% bf16 MFU | 62068 tok/s +step 1569/19560 | loss 3.966756 (-1.37z)| norm 0.3391 (-0.23z)| lr 5.97e-04 | 8447.87 ms | -100.0% bf16 MFU | 62068 tok/s +step 1570/19560 | loss 3.963834 (-1.42z)| norm 0.3316 (-0.41z)| lr 5.97e-04 | 8443.76 ms | -100.0% bf16 MFU | 62069 tok/s +step 1571/19560 | loss 3.993342 (-0.65z)| norm 0.3053 (-1.03z)| lr 5.97e-04 | 8440.11 ms | -100.0% bf16 MFU | 62071 tok/s +step 1572/19560 | loss 3.991027 (-0.71z)| norm 0.2902 (-1.37z)| lr 5.97e-04 | 8436.41 ms | -100.0% bf16 MFU | 62075 tok/s +step 1573/19560 | loss 4.066475 (+1.24z)| norm 0.2950 (-1.26z)| lr 5.97e-04 | 8438.02 ms | -100.0% bf16 MFU | 62078 tok/s +step 1574/19560 | loss 3.990664 (-0.71z)| norm 0.2963 (-1.22z)| lr 5.97e-04 | 8438.91 ms | -100.0% bf16 MFU | 62080 tok/s +step 1575/19560 | loss 3.997854 (-0.52z)| norm 0.2923 (-1.29z)| lr 5.97e-04 | 8435.34 ms | -100.0% bf16 MFU | 62084 tok/s +step 1576/19560 | loss 3.982378 (-0.91z)| norm 0.2984 (-1.13z)| lr 5.97e-04 | 8436.59 ms | -100.0% bf16 MFU | 62087 tok/s +step 1577/19560 | loss 3.985259 (-0.83z)| norm 0.3065 (-0.93z)| lr 5.97e-04 | 8440.04 ms | -100.0% bf16 MFU | 62089 tok/s +step 1578/19560 | loss 3.966823 (-1.29z)| norm 0.2892 (-1.31z)| lr 5.97e-04 | 8437.40 ms | -100.0% bf16 MFU | 62091 tok/s +step 1579/19560 | loss 3.944352 (-1.83z)| norm 0.3247 (-0.48z)| lr 5.97e-04 | 8436.07 ms | -100.0% bf16 MFU | 62094 tok/s +step 1580/19560 | loss 4.060697 (+1.15z)| norm 0.3430 (-0.03z)| lr 5.97e-04 | 8437.84 ms | -100.0% bf16 MFU | 62096 tok/s +step 1581/19560 | loss 3.983131 (-0.83z)| norm 0.4000 (+1.40z)| lr 5.97e-04 | 8439.51 ms | -100.0% bf16 MFU | 62098 tok/s +step 1582/19560 | loss 4.027312 (+0.32z)| norm 0.3653 (+0.60z)| lr 5.97e-04 | 8435.96 ms | -100.0% bf16 MFU | 62100 tok/s +step 1583/19560 | loss 4.028870 (+0.36z)| norm 0.3463 (+0.12z)| lr 5.97e-04 | 8433.95 ms | -100.0% bf16 MFU | 62103 tok/s +step 1584/19560 | loss 4.027016 (+0.31z)| norm 0.3284 (-0.36z)| lr 5.97e-04 | 8437.59 ms | -100.0% bf16 MFU | 62105 tok/s +step 1585/19560 | loss 4.056709 (+1.11z)| norm 0.3508 (+0.23z)| lr 5.97e-04 | 8444.05 ms | -100.0% bf16 MFU | 62104 tok/s +step 1586/19560 | loss 4.000154 (-0.37z)| norm 0.3737 (+0.84z)| lr 5.97e-04 | 8440.00 ms | -100.0% bf16 MFU | 62105 tok/s +step 1587/19560 | loss 4.017454 (+0.10z)| norm 0.3632 (+0.55z)| lr 5.97e-04 | 8438.55 ms | -100.0% bf16 MFU | 62106 tok/s +step 1588/19560 | loss 3.974768 (-1.04z)| norm 0.3422 (-0.02z)| lr 5.97e-04 | 8440.24 ms | -100.0% bf16 MFU | 62107 tok/s +step 1589/19560 | loss 3.944314 (-1.82z)| norm 0.3362 (-0.20z)| lr 5.97e-04 | 8442.53 ms | -100.0% bf16 MFU | 62106 tok/s +step 1590/19560 | loss 3.985573 (-0.70z)| norm 0.3199 (-0.65z)| lr 5.97e-04 | 8440.28 ms | -100.0% bf16 MFU | 62107 tok/s +step 1591/19560 | loss 3.974275 (-0.99z)| norm 0.3304 (-0.36z)| lr 5.97e-04 | 8442.50 ms | -100.0% bf16 MFU | 62107 tok/s +step 1592/19560 | loss 4.052210 (+1.10z)| norm 0.3584 (+0.40z)| lr 5.97e-04 | 8441.75 ms | -100.0% bf16 MFU | 62107 tok/s +step 1593/19560 | loss 3.961073 (-1.33z)| norm 0.3665 (+0.64z)| lr 5.97e-04 | 8441.34 ms | -100.0% bf16 MFU | 62107 tok/s +step 1594/19560 | loss 4.013507 (+0.07z)| norm 0.3342 (-0.24z)| lr 5.97e-04 | 8442.38 ms | -100.0% bf16 MFU | 62107 tok/s +step 1595/19560 | loss 4.006116 (-0.12z)| norm 0.3722 (+0.84z)| lr 5.97e-04 | 8446.14 ms | -100.0% bf16 MFU | 62105 tok/s +step 1596/19560 | loss 3.921754 (-2.31z)| norm 0.4656 (+3.33z)| lr 5.97e-04 | 8447.46 ms | -100.0% bf16 MFU | 62103 tok/s +step 1597/19560 | loss 3.984033 (-0.66z)| norm 0.4966 (+3.92z)| lr 5.97e-04 | 8447.04 ms | -100.0% bf16 MFU | 62101 tok/s +step 1598/19560 | loss 3.958429 (-1.32z)| norm 0.4361 (+2.30z)| lr 5.97e-04 | 8449.78 ms | -100.0% bf16 MFU | 62099 tok/s +step 1599/19560 | loss 3.966204 (-1.11z)| norm 0.3562 (+0.28z)| lr 5.97e-04 | 8449.31 ms | -100.0% bf16 MFU | 62096 tok/s +step 1600/19560 | loss 3.951147 (-1.48z)| norm 0.3215 (-0.59z)| lr 5.97e-04 | 8448.34 ms | -100.0% bf16 MFU | 62094 tok/s +step 1601/19560 | loss 3.975004 (-0.85z)| norm 0.3152 (-0.75z)| lr 5.97e-04 | 8449.14 ms | -100.0% bf16 MFU | 62092 tok/s +step 1602/19560 | loss 3.966222 (-1.06z)| norm 0.3006 (-1.13z)| lr 5.97e-04 | 8449.66 ms | -100.0% bf16 MFU | 62090 tok/s +step 1603/19560 | loss 3.980877 (-0.68z)| norm 0.2998 (-1.16z)| lr 5.97e-04 | 8448.59 ms | -100.0% bf16 MFU | 62088 tok/s +step 1604/19560 | loss 3.974536 (-0.84z)| norm 0.2980 (-1.20z)| lr 5.97e-04 | 8448.12 ms | -100.0% bf16 MFU | 62087 tok/s +step 1605/19560 | loss 3.934917 (-1.84z)| norm 0.3255 (-0.51z)| lr 5.97e-04 | 8448.01 ms | -100.0% bf16 MFU | 62086 tok/s +step 1606/19560 | loss 3.995683 (-0.26z)| norm 0.3005 (-1.14z)| lr 5.97e-04 | 8448.17 ms | -100.0% bf16 MFU | 62084 tok/s +step 1607/19560 | loss 3.998915 (-0.19z)| norm 0.3119 (-0.85z)| lr 5.97e-04 | 8448.54 ms | -100.0% bf16 MFU | 62083 tok/s +step 1608/19560 | loss 3.983060 (-0.60z)| norm 0.2864 (-1.48z)| lr 5.97e-04 | 8449.10 ms | -100.0% bf16 MFU | 62081 tok/s +step 1609/19560 | loss 4.034863 (+0.77z)| norm 0.3169 (-0.71z)| lr 5.97e-04 | 8452.16 ms | -100.0% bf16 MFU | 62079 tok/s +step 1610/19560 | loss 3.950935 (-1.43z)| norm 0.3539 (+0.22z)| lr 5.97e-04 | 8447.38 ms | -100.0% bf16 MFU | 62078 tok/s +step 1611/19560 | loss 3.982302 (-0.60z)| norm 0.3963 (+1.27z)| lr 5.97e-04 | 8448.09 ms | -100.0% bf16 MFU | 62077 tok/s +step 1612/19560 | loss 3.950961 (-1.42z)| norm 0.3288 (-0.43z)| lr 5.97e-04 | 8452.03 ms | -100.0% bf16 MFU | 62075 tok/s +step 1613/19560 | loss 3.986712 (-0.46z)| norm 0.3127 (-0.84z)| lr 5.97e-04 | 8449.00 ms | -100.0% bf16 MFU | 62074 tok/s +step 1614/19560 | loss 3.984031 (-0.53z)| norm 0.3428 (-0.08z)| lr 5.97e-04 | 8449.76 ms | -100.0% bf16 MFU | 62072 tok/s +step 1615/19560 | loss 3.968644 (-0.93z)| norm 0.3338 (-0.30z)| lr 5.97e-04 | 8447.81 ms | -100.0% bf16 MFU | 62072 tok/s +step 1616/19560 | loss 3.945324 (-1.54z)| norm 0.3114 (-0.85z)| lr 5.97e-04 | 8449.44 ms | -100.0% bf16 MFU | 62071 tok/s +step 1617/19560 | loss 3.978504 (-0.65z)| norm 0.3019 (-1.08z)| lr 5.97e-04 | 8445.97 ms | -100.0% bf16 MFU | 62071 tok/s +step 1618/19560 | loss 3.972904 (-0.78z)| norm 0.2940 (-1.26z)| lr 5.97e-04 | 8448.77 ms | -100.0% bf16 MFU | 62070 tok/s +step 1619/19560 | loss 3.969601 (-0.86z)| norm 0.3121 (-0.80z)| lr 5.96e-04 | 8448.50 ms | -100.0% bf16 MFU | 62070 tok/s +step 1620/19560 | loss 3.972876 (-0.76z)| norm 0.3247 (-0.49z)| lr 5.96e-04 | 8446.71 ms | -100.0% bf16 MFU | 62070 tok/s +step 1621/19560 | loss 3.955382 (-1.22z)| norm 0.3372 (-0.17z)| lr 5.96e-04 | 8449.51 ms | -100.0% bf16 MFU | 62069 tok/s +step 1622/19560 | loss 3.953963 (-1.24z)| norm 0.3183 (-0.63z)| lr 5.96e-04 | 8449.66 ms | -100.0% bf16 MFU | 62068 tok/s +step 1623/19560 | loss 3.955575 (-1.18z)| norm 0.3423 (-0.00z)| lr 5.96e-04 | 8451.32 ms | -100.0% bf16 MFU | 62066 tok/s +step 1624/19560 | loss 3.949752 (-1.31z)| norm 0.3614 (+0.51z)| lr 5.96e-04 | 8447.27 ms | -100.0% bf16 MFU | 62066 tok/s +step 1625/19560 | loss 3.959918 (-1.03z)| norm 0.2959 (-1.20z)| lr 5.96e-04 | 8450.06 ms | -100.0% bf16 MFU | 62065 tok/s +step 1626/19560 | loss 3.938703 (-1.57z)| norm 0.2928 (-1.26z)| lr 5.96e-04 | 8451.69 ms | -100.0% bf16 MFU | 62063 tok/s +step 1627/19560 | loss 3.902189 (-2.46z)| norm 0.3176 (-0.61z)| lr 5.96e-04 | 8448.09 ms | -100.0% bf16 MFU | 62063 tok/s +step 1628/19560 | loss 3.932485 (-1.65z)| norm 0.2977 (-1.12z)| lr 5.96e-04 | 8447.12 ms | -100.0% bf16 MFU | 62063 tok/s +step 1629/19560 | loss 3.990114 (-0.16z)| norm 0.3044 (-0.93z)| lr 5.96e-04 | 8447.89 ms | -100.0% bf16 MFU | 62063 tok/s +step 1630/19560 | loss 3.924860 (-1.81z)| norm 0.3049 (-0.91z)| lr 5.96e-04 | 8446.99 ms | -100.0% bf16 MFU | 62064 tok/s +step 1631/19560 | loss 3.959858 (-0.91z)| norm 0.3125 (-0.70z)| lr 5.96e-04 | 8447.35 ms | -100.0% bf16 MFU | 62064 tok/s +step 1632/19560 | loss 3.972645 (-0.58z)| norm 0.3420 (+0.06z)| lr 5.96e-04 | 8477.07 ms | -100.0% bf16 MFU | 62053 tok/s +step 1633/19560 | loss 3.933985 (-1.53z)| norm 0.3229 (-0.43z)| lr 5.96e-04 | 8475.72 ms | -100.0% bf16 MFU | 62043 tok/s +step 1634/19560 | loss 4.007775 (+0.34z)| norm 0.3049 (-0.89z)| lr 5.96e-04 | 8479.04 ms | -100.0% bf16 MFU | 62033 tok/s +step 1635/19560 | loss 3.970151 (-0.63z)| norm 0.3480 (+0.21z)| lr 5.96e-04 | 8478.92 ms | -100.0% bf16 MFU | 62023 tok/s +step 1636/19560 | loss 3.912862 (-2.04z)| norm 0.3751 (+0.89z)| lr 5.96e-04 | 8472.54 ms | -100.0% bf16 MFU | 62016 tok/s +step 1637/19560 | loss 4.114900 (+2.92z)| norm 0.3392 (-0.02z)| lr 5.96e-04 | 8474.59 ms | -100.0% bf16 MFU | 62008 tok/s +step 1638/19560 | loss 3.998692 (+0.09z)| norm 0.5128 (+4.11z)| lr 5.96e-04 | 8472.25 ms | -100.0% bf16 MFU | 62002 tok/s +step 1639/19560 | loss 3.953249 (-1.01z)| norm 0.4587 (+2.73z)| lr 5.96e-04 | 8474.42 ms | -100.0% bf16 MFU | 61995 tok/s +step 1640/19560 | loss 4.014043 (+0.47z)| norm 0.3665 (+0.59z)| lr 5.96e-04 | 8476.57 ms | -100.0% bf16 MFU | 61988 tok/s +step 1641/19560 | loss 3.958189 (-0.88z)| norm 0.3543 (+0.33z)| lr 5.96e-04 | 8473.04 ms | -100.0% bf16 MFU | 61982 tok/s +step 1642/19560 | loss 4.032080 (+0.94z)| norm 0.3849 (+1.09z)| lr 5.96e-04 | 8475.61 ms | -100.0% bf16 MFU | 61976 tok/s +step 1643/19560 | loss 3.967685 (-0.63z)| norm 0.3821 (+1.01z)| lr 5.96e-04 | 8475.00 ms | -100.0% bf16 MFU | 61971 tok/s +step 1644/19560 | loss 3.939418 (-1.32z)| norm 0.3154 (-0.61z)| lr 5.96e-04 | 8468.04 ms | -100.0% bf16 MFU | 61968 tok/s +step 1645/19560 | loss 3.986167 (-0.16z)| norm 0.3080 (-0.78z)| lr 5.96e-04 | 8472.09 ms | -100.0% bf16 MFU | 61964 tok/s +step 1646/19560 | loss 4.046312 (+1.30z)| norm 0.2960 (-1.06z)| lr 5.96e-04 | 8469.27 ms | -100.0% bf16 MFU | 61961 tok/s +step 1647/19560 | loss 3.968357 (-0.61z)| norm 0.2911 (-1.17z)| lr 5.96e-04 | 8468.17 ms | -100.0% bf16 MFU | 61958 tok/s +step 1648/19560 | loss 3.957331 (-0.87z)| norm 0.2968 (-1.03z)| lr 5.96e-04 | 8473.34 ms | -100.0% bf16 MFU | 61954 tok/s +step 1649/19560 | loss 3.987853 (-0.12z)| norm 0.3156 (-0.59z)| lr 5.96e-04 | 8466.23 ms | -100.0% bf16 MFU | 61953 tok/s +step 1650/19560 | loss 4.027404 (+0.86z)| norm 0.3042 (-0.86z)| lr 5.96e-04 | 8462.64 ms | -100.0% bf16 MFU | 61953 tok/s +step 1651/19560 | loss 3.976833 (-0.38z)| norm 0.3211 (-0.45z)| lr 5.96e-04 | 8467.30 ms | -100.0% bf16 MFU | 61951 tok/s +step 1652/19560 | loss 4.021859 (+0.75z)| norm 0.3579 (+0.43z)| lr 5.96e-04 | 8464.83 ms | -100.0% bf16 MFU | 61950 tok/s +step 1653/19560 | loss 3.969496 (-0.55z)| norm 0.3557 (+0.37z)| lr 5.96e-04 | 8467.28 ms | -100.0% bf16 MFU | 61949 tok/s +step 1654/19560 | loss 3.963037 (-0.71z)| norm 0.3626 (+0.54z)| lr 5.96e-04 | 8464.46 ms | -100.0% bf16 MFU | 61948 tok/s +step 1655/19560 | loss 3.975006 (-0.40z)| norm 0.3670 (+0.65z)| lr 5.96e-04 | 8461.85 ms | -100.0% bf16 MFU | 61949 tok/s +step 1656/19560 | loss 3.980195 (-0.27z)| norm 0.3680 (+0.66z)| lr 5.96e-04 | 8465.13 ms | -100.0% bf16 MFU | 61948 tok/s +step 1657/19560 | loss 3.990886 (+0.01z)| norm 0.3377 (-0.06z)| lr 5.96e-04 | 8462.06 ms | -100.0% bf16 MFU | 61949 tok/s +step 1658/19560 | loss 3.970365 (-0.53z)| norm 0.3356 (-0.11z)| lr 5.96e-04 | 8461.61 ms | -100.0% bf16 MFU | 61949 tok/s +step 1659/19560 | loss 3.947489 (-1.11z)| norm 0.3379 (-0.06z)| lr 5.96e-04 | 8466.52 ms | -100.0% bf16 MFU | 61948 tok/s +step 1660/19560 | loss 3.991185 (+0.04z)| norm 0.3122 (-0.69z)| lr 5.96e-04 | 8471.69 ms | -100.0% bf16 MFU | 61945 tok/s +step 1661/19560 | loss 3.954550 (-0.91z)| norm 0.3024 (-0.92z)| lr 5.96e-04 | 8465.94 ms | -100.0% bf16 MFU | 61944 tok/s +step 1662/19560 | loss 3.901805 (-2.26z)| norm 0.3184 (-0.53z)| lr 5.96e-04 | 8464.96 ms | -100.0% bf16 MFU | 61944 tok/s +step 1663/19560 | loss 3.937278 (-1.32z)| norm 0.3667 (+0.66z)| lr 5.96e-04 | 8465.14 ms | -100.0% bf16 MFU | 61943 tok/s +step 1664/19560 | loss 3.977284 (-0.26z)| norm 0.3799 (+0.99z)| lr 5.96e-04 | 8460.80 ms | -100.0% bf16 MFU | 61945 tok/s +step 1665/19560 | loss 3.963935 (-0.60z)| norm 0.3731 (+0.81z)| lr 5.96e-04 | 8462.30 ms | -100.0% bf16 MFU | 61945 tok/s +step 1666/19560 | loss 3.988182 (+0.03z)| norm 0.3403 (+0.01z)| lr 5.96e-04 | 8457.28 ms | -100.0% bf16 MFU | 61947 tok/s +step 1667/19560 | loss 4.037417 (+1.31z)| norm 0.3138 (-0.63z)| lr 5.96e-04 | 8467.15 ms | -100.0% bf16 MFU | 61946 tok/s +step 1668/19560 | loss 3.956595 (-0.79z)| norm 0.3903 (+1.23z)| lr 5.96e-04 | 8459.43 ms | -100.0% bf16 MFU | 61948 tok/s +step 1669/19560 | loss 4.024230 (+1.03z)| norm 0.3190 (-0.51z)| lr 5.96e-04 | 8470.17 ms | -100.0% bf16 MFU | 61945 tok/s +step 1670/19560 | loss 3.931977 (-1.43z)| norm 0.2802 (-1.46z)| lr 5.96e-04 | 8461.94 ms | -100.0% bf16 MFU | 61946 tok/s +step 1671/19560 | loss 4.014418 (+0.77z)| norm 0.2933 (-1.14z)| lr 5.96e-04 | 8459.18 ms | -100.0% bf16 MFU | 61947 tok/s +step 1672/19560 | loss 3.955164 (-0.80z)| norm 0.2944 (-1.11z)| lr 5.96e-04 | 8461.35 ms | -100.0% bf16 MFU | 61948 tok/s +step 1673/19560 | loss 3.919572 (-1.75z)| norm 0.2734 (-1.60z)| lr 5.96e-04 | 8450.72 ms | -100.0% bf16 MFU | 61953 tok/s +step 1674/19560 | loss 3.948018 (-0.97z)| norm 0.2857 (-1.28z)| lr 5.96e-04 | 8455.50 ms | -100.0% bf16 MFU | 61955 tok/s +step 1675/19560 | loss 3.982749 (-0.02z)| norm 0.3139 (-0.59z)| lr 5.96e-04 | 8453.92 ms | -100.0% bf16 MFU | 61959 tok/s +step 1676/19560 | loss 3.959934 (-0.64z)| norm 0.3215 (-0.40z)| lr 5.96e-04 | 8455.18 ms | -100.0% bf16 MFU | 61961 tok/s +step 1677/19560 | loss 4.015576 (+0.87z)| norm 0.3073 (-0.73z)| lr 5.96e-04 | 8449.46 ms | -100.0% bf16 MFU | 61965 tok/s +step 1678/19560 | loss 3.907819 (-2.00z)| norm 0.3159 (-0.52z)| lr 5.96e-04 | 8457.55 ms | -100.0% bf16 MFU | 61967 tok/s +step 1679/19560 | loss 3.971395 (-0.30z)| norm 0.3050 (-0.77z)| lr 5.96e-04 | 8448.52 ms | -100.0% bf16 MFU | 61971 tok/s +step 1680/19560 | loss 4.022883 (+1.12z)| norm 0.2975 (-0.94z)| lr 5.96e-04 | 8453.37 ms | -100.0% bf16 MFU | 61974 tok/s +step 1681/19560 | loss 3.960486 (-0.58z)| norm 0.3231 (-0.31z)| lr 5.96e-04 | 8454.95 ms | -100.0% bf16 MFU | 61976 tok/s +step 1682/19560 | loss 3.981301 (+0.00z)| norm 0.3540 (+0.45z)| lr 5.96e-04 | 8453.25 ms | -100.0% bf16 MFU | 61978 tok/s +step 1683/19560 | loss 3.942889 (-1.06z)| norm 0.3439 (+0.20z)| lr 5.96e-04 | 8449.29 ms | -100.0% bf16 MFU | 61982 tok/s +step 1684/19560 | loss 3.948047 (-0.90z)| norm 0.3338 (-0.05z)| lr 5.96e-04 | 8455.04 ms | -100.0% bf16 MFU | 61983 tok/s +step 1685/19560 | loss 3.952778 (-0.76z)| norm 0.3074 (-0.69z)| lr 5.96e-04 | 8453.36 ms | -100.0% bf16 MFU | 61985 tok/s +step 1686/19560 | loss 3.949173 (-0.85z)| norm 0.2948 (-0.99z)| lr 5.96e-04 | 8456.26 ms | -100.0% bf16 MFU | 61986 tok/s +step 1687/19560 | loss 3.979210 (-0.01z)| norm 0.3151 (-0.49z)| lr 5.96e-04 | 8456.65 ms | -100.0% bf16 MFU | 61986 tok/s +step 1688/19560 | loss 4.005206 (+0.72z)| norm 0.3169 (-0.44z)| lr 5.96e-04 | 8460.59 ms | -100.0% bf16 MFU | 61985 tok/s +step 1689/19560 | loss 3.966861 (-0.34z)| norm 0.3319 (-0.07z)| lr 5.96e-04 | 8458.01 ms | -100.0% bf16 MFU | 61985 tok/s +step 1690/19560 | loss 3.959424 (-0.54z)| norm 0.3678 (+0.80z)| lr 5.96e-04 | 8460.56 ms | -100.0% bf16 MFU | 61984 tok/s +step 1691/19560 | loss 3.907660 (-1.97z)| norm 0.3705 (+0.86z)| lr 5.96e-04 | 8456.21 ms | -100.0% bf16 MFU | 61985 tok/s +step 1692/19560 | loss 3.867764 (-2.98z)| norm 0.3217 (-0.31z)| lr 5.96e-04 | 8458.24 ms | -100.0% bf16 MFU | 61985 tok/s +step 1693/19560 | loss 3.941925 (-0.93z)| norm 0.3511 (+0.43z)| lr 5.96e-04 | 8455.92 ms | -100.0% bf16 MFU | 61986 tok/s +step 1694/19560 | loss 3.928314 (-1.28z)| norm 0.3336 (-0.01z)| lr 5.96e-04 | 8453.10 ms | -100.0% bf16 MFU | 61988 tok/s +step 1695/19560 | loss 3.963202 (-0.33z)| norm 0.3032 (-0.76z)| lr 5.96e-04 | 8454.02 ms | -100.0% bf16 MFU | 61989 tok/s +step 1696/19560 | loss 3.951506 (-0.64z)| norm 0.3136 (-0.50z)| lr 5.96e-04 | 8459.91 ms | -100.0% bf16 MFU | 61989 tok/s +step 1697/19560 | loss 3.966662 (-0.22z)| norm 0.3234 (-0.25z)| lr 5.96e-04 | 8459.28 ms | -100.0% bf16 MFU | 61988 tok/s +step 1698/19560 | loss 3.930819 (-1.19z)| norm 0.3032 (-0.74z)| lr 5.96e-04 | 8454.37 ms | -100.0% bf16 MFU | 61989 tok/s +step 1699/19560 | loss 3.941553 (-0.88z)| norm 0.3287 (-0.11z)| lr 5.96e-04 | 8455.68 ms | -100.0% bf16 MFU | 61990 tok/s +step 1700/19560 | loss 4.012432 (+1.03z)| norm 0.3298 (-0.09z)| lr 5.96e-04 | 8456.97 ms | -100.0% bf16 MFU | 61990 tok/s +step 1701/19560 | loss 3.934836 (-1.06z)| norm 0.5228 (+4.36z)| lr 5.96e-04 | 8455.23 ms | -100.0% bf16 MFU | 61991 tok/s +step 1702/19560 | loss 3.905242 (-1.83z)| norm 0.3127 (-0.53z)| lr 5.96e-04 | 8454.19 ms | -100.0% bf16 MFU | 61992 tok/s +step 1703/19560 | loss 3.943480 (-0.78z)| norm 0.3439 (+0.19z)| lr 5.96e-04 | 8459.12 ms | -100.0% bf16 MFU | 61992 tok/s +step 1704/19560 | loss 3.919530 (-1.41z)| norm 0.3932 (+1.32z)| lr 5.96e-04 | 8462.24 ms | -100.0% bf16 MFU | 61990 tok/s +step 1705/19560 | loss 3.988560 (+0.45z)| norm 0.4267 (+2.04z)| lr 5.96e-04 | 8450.95 ms | -100.0% bf16 MFU | 61992 tok/s +step 1706/19560 | loss 4.021167 (+1.31z)| norm 0.3959 (+1.32z)| lr 5.96e-04 | 8450.74 ms | -100.0% bf16 MFU | 61995 tok/s +step 1707/19560 | loss 3.957733 (-0.39z)| norm 0.3500 (+0.26z)| lr 5.96e-04 | 8463.35 ms | -100.0% bf16 MFU | 61992 tok/s +step 1708/19560 | loss 3.948135 (-0.64z)| norm 0.3159 (-0.51z)| lr 5.96e-04 | 8453.34 ms | -100.0% bf16 MFU | 61994 tok/s +step 1709/19560 | loss 4.036743 (+1.75z)| norm 0.3800 (+0.96z)| lr 5.96e-04 | 8454.86 ms | -100.0% bf16 MFU | 61995 tok/s +step 1710/19560 | loss 3.942234 (-0.79z)| norm 0.4161 (+1.76z)| lr 5.96e-04 | 8456.72 ms | -100.0% bf16 MFU | 61995 tok/s +step 1711/19560 | loss 3.950738 (-0.55z)| norm 0.3830 (+1.00z)| lr 5.96e-04 | 8456.49 ms | -100.0% bf16 MFU | 61995 tok/s +step 1712/19560 | loss 4.001188 (+0.85z)| norm 0.3548 (+0.36z)| lr 5.96e-04 | 8461.72 ms | -100.0% bf16 MFU | 61993 tok/s +step 1713/19560 | loss 3.941861 (-0.78z)| norm 0.3391 (+0.00z)| lr 5.96e-04 | 8450.89 ms | -100.0% bf16 MFU | 61996 tok/s +step 1714/19560 | loss 3.970705 (+0.04z)| norm 0.3187 (-0.45z)| lr 5.96e-04 | 8457.15 ms | -100.0% bf16 MFU | 61995 tok/s +step 1715/19560 | loss 3.983675 (+0.42z)| norm 0.3071 (-0.70z)| lr 5.96e-04 | 8452.57 ms | -100.0% bf16 MFU | 61997 tok/s +step 1716/19560 | loss 3.945083 (-0.67z)| norm 0.2750 (-1.41z)| lr 5.96e-04 | 8457.22 ms | -100.0% bf16 MFU | 61997 tok/s +step 1717/19560 | loss 3.927410 (-1.17z)| norm 0.3033 (-0.76z)| lr 5.96e-04 | 8458.26 ms | -100.0% bf16 MFU | 61996 tok/s +step 1718/19560 | loss 3.956332 (-0.34z)| norm 0.3140 (-0.52z)| lr 5.96e-04 | 8456.12 ms | -100.0% bf16 MFU | 61996 tok/s +step 1719/19560 | loss 3.961160 (-0.20z)| norm 0.2967 (-0.90z)| lr 5.96e-04 | 8457.04 ms | -100.0% bf16 MFU | 61996 tok/s +step 1720/19560 | loss 3.937865 (-0.85z)| norm 0.2864 (-1.12z)| lr 5.96e-04 | 8453.08 ms | -100.0% bf16 MFU | 61998 tok/s +step 1721/19560 | loss 3.946784 (-0.59z)| norm 0.3091 (-0.60z)| lr 5.96e-04 | 8448.55 ms | -100.0% bf16 MFU | 62001 tok/s +step 1722/19560 | loss 3.960104 (-0.20z)| norm 0.3263 (-0.22z)| lr 5.96e-04 | 8454.33 ms | -100.0% bf16 MFU | 62001 tok/s +step 1723/19560 | loss 3.917656 (-1.41z)| norm 0.3576 (+0.48z)| lr 5.96e-04 | 8453.63 ms | -100.0% bf16 MFU | 62002 tok/s +step 1724/19560 | loss 4.021846 (+1.59z)| norm 0.3627 (+0.64z)| lr 5.96e-04 | 8452.21 ms | -100.0% bf16 MFU | 62004 tok/s +step 1725/19560 | loss 3.941165 (-0.74z)| norm 0.3430 (+0.22z)| lr 5.96e-04 | 8451.85 ms | -100.0% bf16 MFU | 62005 tok/s +step 1726/19560 | loss 4.004800 (+1.09z)| norm 0.3676 (+0.86z)| lr 5.96e-04 | 8451.39 ms | -100.0% bf16 MFU | 62007 tok/s +step 1727/19560 | loss 3.948396 (-0.53z)| norm 0.3500 (+0.42z)| lr 5.96e-04 | 8450.37 ms | -100.0% bf16 MFU | 62008 tok/s +step 1728/19560 | loss 3.990933 (+0.68z)| norm 0.3247 (-0.22z)| lr 5.96e-04 | 8453.83 ms | -100.0% bf16 MFU | 62009 tok/s +step 1729/19560 | loss 3.901083 (-1.85z)| norm 0.3501 (+0.41z)| lr 5.96e-04 | 8457.24 ms | -100.0% bf16 MFU | 62008 tok/s +step 1730/19560 | loss 3.869058 (-2.66z)| norm 0.2899 (-1.09z)| lr 5.96e-04 | 8451.97 ms | -100.0% bf16 MFU | 62009 tok/s +step 1731/19560 | loss 3.908839 (-1.54z)| norm 0.2881 (-1.13z)| lr 5.96e-04 | 8448.30 ms | -100.0% bf16 MFU | 62012 tok/s +step 1732/19560 | loss 3.966679 (+0.04z)| norm 0.3049 (-0.71z)| lr 5.96e-04 | 8455.60 ms | -100.0% bf16 MFU | 62011 tok/s +step 1733/19560 | loss 3.915631 (-1.34z)| norm 0.2697 (-1.56z)| lr 5.96e-04 | 8452.88 ms | -100.0% bf16 MFU | 62012 tok/s +step 1734/19560 | loss 3.944136 (-0.56z)| norm 0.2806 (-1.28z)| lr 5.96e-04 | 8449.87 ms | -100.0% bf16 MFU | 62014 tok/s +step 1735/19560 | loss 3.956451 (-0.21z)| norm 0.2771 (-1.35z)| lr 5.96e-04 | 8451.02 ms | -100.0% bf16 MFU | 62015 tok/s +step 1736/19560 | loss 3.948952 (-0.41z)| norm 0.2598 (-1.76z)| lr 5.96e-04 | 8447.76 ms | -100.0% bf16 MFU | 62017 tok/s +step 1737/19560 | loss 3.981340 (+0.49z)| norm 0.2662 (-1.58z)| lr 5.96e-04 | 8451.36 ms | -100.0% bf16 MFU | 62018 tok/s +step 1738/19560 | loss 3.897311 (-1.80z)| norm 0.2747 (-1.35z)| lr 5.96e-04 | 8456.94 ms | -100.0% bf16 MFU | 62017 tok/s +step 1739/19560 | loss 3.921729 (-1.12z)| norm 0.2816 (-1.17z)| lr 5.96e-04 | 8454.32 ms | -100.0% bf16 MFU | 62017 tok/s +step 1740/19560 | loss 3.912853 (-1.34z)| norm 0.3076 (-0.54z)| lr 5.96e-04 | 8457.48 ms | -100.0% bf16 MFU | 62016 tok/s +step 1741/19560 | loss 3.936296 (-0.70z)| norm 0.3405 (+0.24z)| lr 5.96e-04 | 8451.98 ms | -100.0% bf16 MFU | 62016 tok/s +step 1742/19560 | loss 3.926333 (-0.95z)| norm 0.3356 (+0.12z)| lr 5.96e-04 | 8452.32 ms | -100.0% bf16 MFU | 62017 tok/s +step 1743/19560 | loss 3.937709 (-0.64z)| norm 0.3689 (+0.91z)| lr 5.95e-04 | 8451.42 ms | -100.0% bf16 MFU | 62018 tok/s +step 1744/19560 | loss 3.977716 (+0.43z)| norm 0.3989 (+1.60z)| lr 5.95e-04 | 8449.96 ms | -100.0% bf16 MFU | 62019 tok/s +step 1745/19560 | loss 4.009597 (+1.28z)| norm 0.3355 (+0.09z)| lr 5.95e-04 | 8448.06 ms | -100.0% bf16 MFU | 62021 tok/s +step 1746/19560 | loss 4.007380 (+1.21z)| norm 0.3529 (+0.49z)| lr 5.95e-04 | 8448.59 ms | -100.0% bf16 MFU | 62023 tok/s +step 1747/19560 | loss 3.992224 (+0.80z)| norm 0.3376 (+0.12z)| lr 5.95e-04 | 8457.19 ms | -100.0% bf16 MFU | 62022 tok/s +step 1748/19560 | loss 3.952861 (-0.25z)| norm 0.3310 (-0.03z)| lr 5.95e-04 | 8453.15 ms | -100.0% bf16 MFU | 62022 tok/s +step 1749/19560 | loss 4.014218 (+1.36z)| norm 0.3343 (+0.05z)| lr 5.95e-04 | 8454.79 ms | -100.0% bf16 MFU | 62021 tok/s +step 1750/19560 | loss 3.930679 (-0.84z)| norm 0.3275 (-0.12z)| lr 5.95e-04 | 8452.31 ms | -100.0% bf16 MFU | 62022 tok/s +val loss 3.923858 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2636/10042 = 0.262498 +step 1751/19560 | loss 3.881977 (-2.07z)| norm 0.3168 (-0.37z)| lr 5.95e-04 | 8437.65 ms | -100.0% bf16 MFU | 62027 tok/s +step 1752/19560 | loss 3.879713 (-2.08z)| norm 0.2967 (-0.83z)| lr 5.95e-04 | 8433.14 ms | -100.0% bf16 MFU | 62034 tok/s +step 1753/19560 | loss 3.936488 (-0.63z)| norm 0.2989 (-0.78z)| lr 5.95e-04 | 8437.29 ms | -100.0% bf16 MFU | 62040 tok/s +step 1754/19560 | loss 3.938953 (-0.56z)| norm 0.3124 (-0.46z)| lr 5.95e-04 | 8439.84 ms | -100.0% bf16 MFU | 62044 tok/s +step 1755/19560 | loss 3.914953 (-1.18z)| norm 0.3564 (+0.57z)| lr 5.95e-04 | 8438.45 ms | -100.0% bf16 MFU | 62048 tok/s +step 1756/19560 | loss 3.950189 (-0.29z)| norm 0.3414 (+0.21z)| lr 5.95e-04 | 8438.64 ms | -100.0% bf16 MFU | 62052 tok/s +step 1757/19560 | loss 3.975469 (+0.36z)| norm 0.2971 (-0.84z)| lr 5.95e-04 | 8436.46 ms | -100.0% bf16 MFU | 62057 tok/s +step 1758/19560 | loss 3.985595 (+0.61z)| norm 0.3161 (-0.39z)| lr 5.95e-04 | 8434.92 ms | -100.0% bf16 MFU | 62062 tok/s +step 1759/19560 | loss 3.918085 (-1.10z)| norm 0.3241 (-0.20z)| lr 5.95e-04 | 8433.15 ms | -100.0% bf16 MFU | 62067 tok/s +step 1760/19560 | loss 3.944242 (-0.43z)| norm 0.3243 (-0.20z)| lr 5.95e-04 | 8436.32 ms | -100.0% bf16 MFU | 62071 tok/s +step 1761/19560 | loss 3.918487 (-1.08z)| norm 0.3175 (-0.36z)| lr 5.95e-04 | 8436.35 ms | -100.0% bf16 MFU | 62075 tok/s +step 1762/19560 | loss 3.981346 (+0.52z)| norm 0.3353 (+0.06z)| lr 5.95e-04 | 8436.70 ms | -100.0% bf16 MFU | 62078 tok/s +step 1763/19560 | loss 3.985987 (+0.64z)| norm 0.3544 (+0.52z)| lr 5.95e-04 | 8435.40 ms | -100.0% bf16 MFU | 62082 tok/s +step 1764/19560 | loss 4.000005 (+0.98z)| norm 0.3259 (-0.16z)| lr 5.95e-04 | 8435.97 ms | -100.0% bf16 MFU | 62085 tok/s +step 1765/19560 | loss 3.970081 (+0.26z)| norm 0.3203 (-0.29z)| lr 5.95e-04 | 8434.31 ms | -100.0% bf16 MFU | 62089 tok/s +step 1766/19560 | loss 3.918228 (-1.13z)| norm 0.3152 (-0.40z)| lr 5.95e-04 | 8436.06 ms | -100.0% bf16 MFU | 62092 tok/s +step 1767/19560 | loss 3.949307 (-0.29z)| norm 0.3250 (-0.13z)| lr 5.95e-04 | 8439.80 ms | -100.0% bf16 MFU | 62094 tok/s +step 1768/19560 | loss 3.946596 (-0.35z)| norm 0.2983 (-0.84z)| lr 5.95e-04 | 8435.03 ms | -100.0% bf16 MFU | 62097 tok/s +step 1769/19560 | loss 3.938815 (-0.56z)| norm 0.3199 (-0.24z)| lr 5.95e-04 | 8438.86 ms | -100.0% bf16 MFU | 62098 tok/s +step 1770/19560 | loss 3.931526 (-0.75z)| norm 0.4201 (+2.44z)| lr 5.95e-04 | 8437.15 ms | -100.0% bf16 MFU | 62100 tok/s +step 1771/19560 | loss 3.890209 (-1.85z)| norm 0.5693 (+5.61z)| lr 5.95e-04 | 8435.59 ms | -100.0% bf16 MFU | 62103 tok/s +step 1772/19560 | loss 3.907755 (-1.36z)| norm 0.4823 (+3.37z)| lr 5.95e-04 | 8437.89 ms | -100.0% bf16 MFU | 62105 tok/s +step 1773/19560 | loss 3.939857 (-0.47z)| norm 0.3340 (+0.04z)| lr 5.95e-04 | 8438.23 ms | -100.0% bf16 MFU | 62106 tok/s +step 1774/19560 | loss 3.939110 (-0.48z)| norm 0.3325 (+0.00z)| lr 5.95e-04 | 8441.13 ms | -100.0% bf16 MFU | 62106 tok/s +step 1775/19560 | loss 3.972126 (+0.44z)| norm 0.2989 (-0.75z)| lr 5.95e-04 | 8438.31 ms | -100.0% bf16 MFU | 62108 tok/s +step 1776/19560 | loss 3.912365 (-1.21z)| norm 0.2877 (-1.00z)| lr 5.95e-04 | 8443.10 ms | -100.0% bf16 MFU | 62107 tok/s +step 1777/19560 | loss 3.956451 (+0.02z)| norm 0.2743 (-1.29z)| lr 5.95e-04 | 8440.50 ms | -100.0% bf16 MFU | 62107 tok/s +step 1778/19560 | loss 3.935941 (-0.54z)| norm 0.2725 (-1.32z)| lr 5.95e-04 | 8440.54 ms | -100.0% bf16 MFU | 62108 tok/s +step 1779/19560 | loss 3.911391 (-1.21z)| norm 0.2869 (-0.99z)| lr 5.95e-04 | 8445.64 ms | -100.0% bf16 MFU | 62106 tok/s +step 1780/19560 | loss 3.986720 (+0.92z)| norm 0.2951 (-0.80z)| lr 5.95e-04 | 8440.86 ms | -100.0% bf16 MFU | 62107 tok/s +step 1781/19560 | loss 3.918863 (-0.99z)| norm 0.2854 (-1.00z)| lr 5.95e-04 | 8442.85 ms | -100.0% bf16 MFU | 62106 tok/s +step 1782/19560 | loss 3.933581 (-0.57z)| norm 0.2848 (-0.99z)| lr 5.95e-04 | 8448.68 ms | -100.0% bf16 MFU | 62104 tok/s +step 1783/19560 | loss 3.945570 (-0.22z)| norm 0.2707 (-1.28z)| lr 5.95e-04 | 8447.23 ms | -100.0% bf16 MFU | 62102 tok/s +step 1784/19560 | loss 3.906671 (-1.30z)| norm 0.2815 (-1.03z)| lr 5.95e-04 | 8445.51 ms | -100.0% bf16 MFU | 62101 tok/s +step 1785/19560 | loss 3.960968 (+0.24z)| norm 0.2812 (-1.02z)| lr 5.95e-04 | 8450.25 ms | -100.0% bf16 MFU | 62098 tok/s +step 1786/19560 | loss 3.976593 (+0.68z)| norm 0.2949 (-0.71z)| lr 5.95e-04 | 8450.18 ms | -100.0% bf16 MFU | 62095 tok/s +step 1787/19560 | loss 3.902128 (-1.41z)| norm 0.3106 (-0.37z)| lr 5.95e-04 | 8448.77 ms | -100.0% bf16 MFU | 62093 tok/s +step 1788/19560 | loss 3.924170 (-0.78z)| norm 0.3174 (-0.22z)| lr 5.95e-04 | 8447.81 ms | -100.0% bf16 MFU | 62092 tok/s +step 1789/19560 | loss 3.877556 (-2.04z)| norm 0.3237 (-0.09z)| lr 5.95e-04 | 8449.39 ms | -100.0% bf16 MFU | 62090 tok/s +step 1790/19560 | loss 3.881307 (-1.92z)| norm 0.3283 (+0.01z)| lr 5.95e-04 | 8450.36 ms | -100.0% bf16 MFU | 62087 tok/s +step 1791/19560 | loss 3.939800 (-0.31z)| norm 0.3576 (+0.65z)| lr 5.95e-04 | 8453.41 ms | -100.0% bf16 MFU | 62084 tok/s +step 1792/19560 | loss 3.893399 (-1.56z)| norm 0.3432 (+0.35z)| lr 5.95e-04 | 8451.72 ms | -100.0% bf16 MFU | 62081 tok/s +step 1793/19560 | loss 3.887429 (-1.69z)| norm 0.3301 (+0.07z)| lr 5.95e-04 | 8449.37 ms | -100.0% bf16 MFU | 62080 tok/s +step 1794/19560 | loss 3.897891 (-1.38z)| norm 0.2948 (-0.70z)| lr 5.95e-04 | 8446.95 ms | -100.0% bf16 MFU | 62079 tok/s +step 1795/19560 | loss 3.923876 (-0.67z)| norm 0.3195 (-0.16z)| lr 5.95e-04 | 8451.50 ms | -100.0% bf16 MFU | 62077 tok/s +step 1796/19560 | loss 3.962456 (+0.39z)| norm 0.3594 (+0.73z)| lr 5.95e-04 | 8446.93 ms | -100.0% bf16 MFU | 62077 tok/s +step 1797/19560 | loss 3.963437 (+0.44z)| norm 0.4260 (+2.14z)| lr 5.95e-04 | 8449.70 ms | -100.0% bf16 MFU | 62075 tok/s +step 1798/19560 | loss 3.952875 (+0.14z)| norm 0.3724 (+0.96z)| lr 5.95e-04 | 8449.72 ms | -100.0% bf16 MFU | 62074 tok/s +step 1799/19560 | loss 3.901867 (-1.27z)| norm 0.3029 (-0.55z)| lr 5.95e-04 | 8450.24 ms | -100.0% bf16 MFU | 62072 tok/s +step 1800/19560 | loss 3.963807 (+0.47z)| norm 0.3175 (-0.24z)| lr 5.95e-04 | 8450.43 ms | -100.0% bf16 MFU | 62071 tok/s +step 1801/19560 | loss 3.937118 (-0.29z)| norm 0.3014 (-0.59z)| lr 5.95e-04 | 8449.50 ms | -100.0% bf16 MFU | 62070 tok/s +step 1802/19560 | loss 3.966189 (+0.53z)| norm 0.3160 (-0.28z)| lr 5.95e-04 | 8450.68 ms | -100.0% bf16 MFU | 62068 tok/s +step 1803/19560 | loss 3.908149 (-1.09z)| norm 0.3310 (+0.05z)| lr 5.95e-04 | 8449.94 ms | -100.0% bf16 MFU | 62067 tok/s +step 1804/19560 | loss 3.897339 (-1.37z)| norm 0.3346 (+0.12z)| lr 5.95e-04 | 8447.69 ms | -100.0% bf16 MFU | 62067 tok/s +step 1805/19560 | loss 3.912981 (-0.92z)| norm 0.3462 (+0.37z)| lr 5.95e-04 | 8451.15 ms | -100.0% bf16 MFU | 62066 tok/s +step 1806/19560 | loss 3.967080 (+0.60z)| norm 0.3113 (-0.40z)| lr 5.95e-04 | 8448.25 ms | -100.0% bf16 MFU | 62065 tok/s +step 1807/19560 | loss 3.898198 (-1.33z)| norm 0.3248 (-0.10z)| lr 5.95e-04 | 8449.81 ms | -100.0% bf16 MFU | 62064 tok/s +step 1808/19560 | loss 3.930735 (-0.40z)| norm 0.3252 (-0.10z)| lr 5.95e-04 | 8448.79 ms | -100.0% bf16 MFU | 62064 tok/s +step 1809/19560 | loss 3.954572 (+0.29z)| norm 0.3248 (-0.11z)| lr 5.95e-04 | 8449.34 ms | -100.0% bf16 MFU | 62063 tok/s +step 1810/19560 | loss 3.885664 (-1.67z)| norm 0.2832 (-1.01z)| lr 5.95e-04 | 8447.06 ms | -100.0% bf16 MFU | 62063 tok/s +step 1811/19560 | loss 3.985293 (+1.17z)| norm 0.2958 (-0.72z)| lr 5.95e-04 | 8448.56 ms | -100.0% bf16 MFU | 62063 tok/s +step 1812/19560 | loss 3.935390 (-0.25z)| norm 0.3167 (-0.26z)| lr 5.95e-04 | 8450.08 ms | -100.0% bf16 MFU | 62062 tok/s +step 1813/19560 | loss 3.987113 (+1.21z)| norm 0.3710 (+0.92z)| lr 5.95e-04 | 8447.64 ms | -100.0% bf16 MFU | 62062 tok/s +step 1814/19560 | loss 3.918342 (-0.73z)| norm 0.3412 (+0.26z)| lr 5.95e-04 | 8444.81 ms | -100.0% bf16 MFU | 62063 tok/s +step 1815/19560 | loss 3.942688 (-0.03z)| norm 0.3026 (-0.59z)| lr 5.95e-04 | 8440.09 ms | -100.0% bf16 MFU | 62066 tok/s +step 1816/19560 | loss 3.936404 (-0.20z)| norm 0.3382 (+0.19z)| lr 5.95e-04 | 8437.59 ms | -100.0% bf16 MFU | 62070 tok/s +step 1817/19560 | loss 3.866374 (-2.15z)| norm 0.3056 (-0.52z)| lr 5.95e-04 | 8434.90 ms | -100.0% bf16 MFU | 62074 tok/s +step 1818/19560 | loss 3.891071 (-1.43z)| norm 0.3347 (+0.12z)| lr 5.95e-04 | 8438.74 ms | -100.0% bf16 MFU | 62077 tok/s +step 1819/19560 | loss 3.930992 (-0.32z)| norm 0.3263 (-0.05z)| lr 5.95e-04 | 8433.70 ms | -100.0% bf16 MFU | 62081 tok/s +step 1820/19560 | loss 3.889390 (-1.50z)| norm 0.3169 (-0.26z)| lr 5.95e-04 | 8436.23 ms | -100.0% bf16 MFU | 62084 tok/s +step 1821/19560 | loss 4.058793 (+3.14z)| norm 0.3356 (+0.15z)| lr 5.95e-04 | 8437.31 ms | -100.0% bf16 MFU | 62087 tok/s +step 1822/19560 | loss 3.905179 (-1.03z)| norm 0.3428 (+0.31z)| lr 5.95e-04 | 8446.34 ms | -100.0% bf16 MFU | 62087 tok/s +step 1823/19560 | loss 3.860907 (-2.17z)| norm 0.3167 (-0.27z)| lr 5.95e-04 | 8457.75 ms | -100.0% bf16 MFU | 62082 tok/s +step 1824/19560 | loss 3.842177 (-2.58z)| norm 0.3135 (-0.34z)| lr 5.95e-04 | 8462.46 ms | -100.0% bf16 MFU | 62075 tok/s +step 1825/19560 | loss 3.808936 (-3.26z)| norm 0.3173 (-0.25z)| lr 5.95e-04 | 8458.97 ms | -100.0% bf16 MFU | 62071 tok/s +step 1826/19560 | loss 3.789617 (-3.53z)| norm 0.3093 (-0.43z)| lr 5.95e-04 | 8461.07 ms | -100.0% bf16 MFU | 62065 tok/s +step 1827/19560 | loss 3.866471 (-1.68z)| norm 0.3256 (-0.07z)| lr 5.95e-04 | 8457.89 ms | -100.0% bf16 MFU | 62061 tok/s +step 1828/19560 | loss 3.870875 (-1.56z)| norm 0.3338 (+0.11z)| lr 5.95e-04 | 8459.28 ms | -100.0% bf16 MFU | 62057 tok/s +step 1829/19560 | loss 3.880059 (-1.32z)| norm 0.3163 (-0.26z)| lr 5.95e-04 | 8454.24 ms | -100.0% bf16 MFU | 62055 tok/s +step 1830/19560 | loss 3.957847 (+0.48z)| norm 0.3236 (-0.09z)| lr 5.95e-04 | 8455.87 ms | -100.0% bf16 MFU | 62052 tok/s +step 1831/19560 | loss 3.833126 (-2.35z)| norm 0.3577 (+0.72z)| lr 5.95e-04 | 8457.45 ms | -100.0% bf16 MFU | 62049 tok/s +step 1832/19560 | loss 3.802096 (-2.94z)| norm 0.2981 (-0.68z)| lr 5.95e-04 | 8456.38 ms | -100.0% bf16 MFU | 62047 tok/s +step 1833/19560 | loss 3.917643 (-0.39z)| norm 0.2972 (-0.70z)| lr 5.95e-04 | 8457.50 ms | -100.0% bf16 MFU | 62044 tok/s +step 1834/19560 | loss 3.781879 (-3.25z)| norm 0.3058 (-0.47z)| lr 5.95e-04 | 8459.49 ms | -100.0% bf16 MFU | 62041 tok/s +step 1835/19560 | loss 3.892090 (-0.87z)| norm 0.3026 (-0.54z)| lr 5.95e-04 | 8457.46 ms | -100.0% bf16 MFU | 62038 tok/s +step 1836/19560 | loss 3.949839 (+0.37z)| norm 0.2675 (-1.39z)| lr 5.95e-04 | 8461.31 ms | -100.0% bf16 MFU | 62034 tok/s +step 1837/19560 | loss 3.968335 (+0.79z)| norm 0.2603 (-1.54z)| lr 5.95e-04 | 8458.25 ms | -100.0% bf16 MFU | 62032 tok/s +step 1838/19560 | loss 3.824406 (-2.28z)| norm 0.2731 (-1.22z)| lr 5.95e-04 | 8456.28 ms | -100.0% bf16 MFU | 62030 tok/s +step 1839/19560 | loss 3.873301 (-1.22z)| norm 0.3143 (-0.18z)| lr 5.95e-04 | 8453.67 ms | -100.0% bf16 MFU | 62030 tok/s +step 1840/19560 | loss 3.911541 (-0.39z)| norm 0.2736 (-1.19z)| lr 5.95e-04 | 8453.14 ms | -100.0% bf16 MFU | 62029 tok/s +step 1841/19560 | loss 3.841428 (-1.85z)| norm 0.2910 (-0.74z)| lr 5.95e-04 | 8459.79 ms | -100.0% bf16 MFU | 62027 tok/s +step 1842/19560 | loss 3.870285 (-1.22z)| norm 0.3054 (-0.38z)| lr 5.95e-04 | 8456.48 ms | -100.0% bf16 MFU | 62025 tok/s +step 1843/19560 | loss 3.915963 (-0.25z)| norm 0.3325 (+0.30z)| lr 5.95e-04 | 8454.95 ms | -100.0% bf16 MFU | 62025 tok/s +step 1844/19560 | loss 3.878394 (-1.03z)| norm 0.3471 (+0.65z)| lr 5.95e-04 | 8460.10 ms | -100.0% bf16 MFU | 62022 tok/s +step 1845/19560 | loss 3.885087 (-0.88z)| norm 0.3183 (-0.08z)| lr 5.95e-04 | 8453.95 ms | -100.0% bf16 MFU | 62022 tok/s +step 1846/19560 | loss 3.895290 (-0.66z)| norm 0.3580 (+0.91z)| lr 5.95e-04 | 8455.76 ms | -100.0% bf16 MFU | 62021 tok/s +step 1847/19560 | loss 3.882044 (-0.92z)| norm 0.3772 (+1.37z)| lr 5.95e-04 | 8457.26 ms | -100.0% bf16 MFU | 62019 tok/s +step 1848/19560 | loss 3.900918 (-0.52z)| norm 0.3466 (+0.60z)| lr 5.95e-04 | 8463.25 ms | -100.0% bf16 MFU | 62016 tok/s +step 1849/19560 | loss 3.867048 (-1.21z)| norm 0.3542 (+0.77z)| lr 5.95e-04 | 8452.99 ms | -100.0% bf16 MFU | 62016 tok/s +step 1850/19560 | loss 3.867619 (-1.18z)| norm 0.3562 (+0.82z)| lr 5.95e-04 | 8456.44 ms | -100.0% bf16 MFU | 62015 tok/s +step 1851/19560 | loss 3.920883 (-0.07z)| norm 0.3410 (+0.44z)| lr 5.95e-04 | 8459.42 ms | -100.0% bf16 MFU | 62013 tok/s +step 1852/19560 | loss 3.985516 (+1.29z)| norm 0.3417 (+0.47z)| lr 5.95e-04 | 8451.82 ms | -100.0% bf16 MFU | 62014 tok/s +step 1853/19560 | loss 3.832413 (-1.88z)| norm 0.3282 (+0.13z)| lr 5.94e-04 | 8453.11 ms | -100.0% bf16 MFU | 62015 tok/s +step 1854/19560 | loss 3.925064 (+0.05z)| norm 0.3867 (+1.58z)| lr 5.94e-04 | 8447.49 ms | -100.0% bf16 MFU | 62017 tok/s +step 1855/19560 | loss 3.955795 (+0.70z)| norm 0.3426 (+0.49z)| lr 5.94e-04 | 8449.32 ms | -100.0% bf16 MFU | 62019 tok/s +step 1856/19560 | loss 3.835781 (-1.79z)| norm 0.3156 (-0.18z)| lr 5.94e-04 | 8453.38 ms | -100.0% bf16 MFU | 62019 tok/s +step 1857/19560 | loss 3.934318 (+0.26z)| norm 0.2946 (-0.70z)| lr 5.94e-04 | 8454.68 ms | -100.0% bf16 MFU | 62019 tok/s +step 1858/19560 | loss 3.927104 (+0.11z)| norm 0.3088 (-0.35z)| lr 5.94e-04 | 8446.58 ms | -100.0% bf16 MFU | 62021 tok/s +step 1859/19560 | loss 3.860220 (-1.28z)| norm 0.3103 (-0.31z)| lr 5.94e-04 | 8452.58 ms | -100.0% bf16 MFU | 62022 tok/s +step 1860/19560 | loss 3.878735 (-0.88z)| norm 0.2826 (-1.00z)| lr 5.94e-04 | 8450.44 ms | -100.0% bf16 MFU | 62023 tok/s +step 1861/19560 | loss 3.892812 (-0.58z)| norm 0.2898 (-0.83z)| lr 5.94e-04 | 8452.24 ms | -100.0% bf16 MFU | 62023 tok/s +step 1862/19560 | loss 3.841724 (-1.62z)| norm 0.3143 (-0.22z)| lr 5.94e-04 | 8446.55 ms | -100.0% bf16 MFU | 62025 tok/s +step 1863/19560 | loss 3.800426 (-2.39z)| norm 0.3337 (+0.26z)| lr 5.94e-04 | 8448.57 ms | -100.0% bf16 MFU | 62027 tok/s +step 1864/19560 | loss 3.918880 (+0.01z)| norm 0.3363 (+0.31z)| lr 5.94e-04 | 8449.19 ms | -100.0% bf16 MFU | 62028 tok/s +step 1865/19560 | loss 3.894674 (-0.47z)| norm 0.3092 (-0.39z)| lr 5.94e-04 | 8448.84 ms | -100.0% bf16 MFU | 62030 tok/s +step 1866/19560 | loss 3.907964 (-0.20z)| norm 0.3168 (-0.21z)| lr 5.94e-04 | 8448.86 ms | -100.0% bf16 MFU | 62031 tok/s +step 1867/19560 | loss 4.017459 (+1.98z)| norm 0.3157 (-0.24z)| lr 5.94e-04 | 8452.39 ms | -100.0% bf16 MFU | 62031 tok/s +step 1868/19560 | loss 3.834907 (-1.65z)| norm 0.3393 (+0.36z)| lr 5.94e-04 | 8447.94 ms | -100.0% bf16 MFU | 62032 tok/s +step 1869/19560 | loss 3.825381 (-1.80z)| norm 0.3898 (+1.65z)| lr 5.94e-04 | 8448.17 ms | -100.0% bf16 MFU | 62034 tok/s +step 1870/19560 | loss 3.866761 (-0.98z)| norm 0.4692 (+3.49z)| lr 5.94e-04 | 8445.20 ms | -100.0% bf16 MFU | 62036 tok/s +step 1871/19560 | loss 3.911528 (-0.10z)| norm 0.4326 (+2.53z)| lr 5.94e-04 | 8454.50 ms | -100.0% bf16 MFU | 62035 tok/s +step 1872/19560 | loss 3.848012 (-1.32z)| norm 0.3419 (+0.37z)| lr 5.94e-04 | 8452.00 ms | -100.0% bf16 MFU | 62035 tok/s +step 1873/19560 | loss 3.861922 (-1.03z)| norm 0.2825 (-1.06z)| lr 5.94e-04 | 8448.12 ms | -100.0% bf16 MFU | 62036 tok/s +step 1874/19560 | loss 3.911131 (-0.05z)| norm 0.2743 (-1.24z)| lr 5.94e-04 | 8459.59 ms | -100.0% bf16 MFU | 62033 tok/s +step 1875/19560 | loss 3.873883 (-0.78z)| norm 0.2997 (-0.62z)| lr 5.94e-04 | 8455.16 ms | -100.0% bf16 MFU | 62032 tok/s +step 1876/19560 | loss 3.841796 (-1.40z)| norm 0.2914 (-0.81z)| lr 5.94e-04 | 8461.04 ms | -100.0% bf16 MFU | 62028 tok/s +step 1877/19560 | loss 3.860335 (-1.02z)| norm 0.2878 (-0.88z)| lr 5.94e-04 | 8460.54 ms | -100.0% bf16 MFU | 62025 tok/s +step 1878/19560 | loss 3.824214 (-1.71z)| norm 0.2838 (-0.97z)| lr 5.94e-04 | 8453.60 ms | -100.0% bf16 MFU | 62025 tok/s +step 1879/19560 | loss 3.806400 (-2.03z)| norm 0.3015 (-0.54z)| lr 5.94e-04 | 8457.04 ms | -100.0% bf16 MFU | 62023 tok/s +step 1880/19560 | loss 4.013338 (+2.00z)| norm 0.2878 (-0.87z)| lr 5.94e-04 | 8458.75 ms | -100.0% bf16 MFU | 62021 tok/s +step 1881/19560 | loss 3.849848 (-1.16z)| norm 0.2855 (-0.92z)| lr 5.94e-04 | 8459.87 ms | -100.0% bf16 MFU | 62019 tok/s +step 1882/19560 | loss 3.857031 (-1.00z)| norm 0.2635 (-1.42z)| lr 5.94e-04 | 8461.68 ms | -100.0% bf16 MFU | 62016 tok/s +step 1883/19560 | loss 3.810236 (-1.86z)| norm 0.2687 (-1.27z)| lr 5.94e-04 | 8459.51 ms | -100.0% bf16 MFU | 62014 tok/s +step 1884/19560 | loss 4.011504 (+1.93z)| norm 0.3385 (+0.36z)| lr 5.94e-04 | 8457.72 ms | -100.0% bf16 MFU | 62013 tok/s +step 1885/19560 | loss 3.903404 (-0.09z)| norm 0.3500 (+0.62z)| lr 5.94e-04 | 8457.80 ms | -100.0% bf16 MFU | 62012 tok/s +step 1886/19560 | loss 3.823758 (-1.57z)| norm 0.3131 (-0.24z)| lr 5.94e-04 | 8451.16 ms | -100.0% bf16 MFU | 62013 tok/s +step 1887/19560 | loss 3.865482 (-0.77z)| norm 0.3254 (+0.04z)| lr 5.94e-04 | 8453.81 ms | -100.0% bf16 MFU | 62013 tok/s +step 1888/19560 | loss 3.862761 (-0.81z)| norm 0.4486 (+2.82z)| lr 5.94e-04 | 8456.70 ms | -100.0% bf16 MFU | 62012 tok/s +step 1889/19560 | loss 3.937499 (+0.59z)| norm 0.3986 (+1.65z)| lr 5.94e-04 | 8456.50 ms | -100.0% bf16 MFU | 62012 tok/s +step 1890/19560 | loss 3.942560 (+0.70z)| norm 0.3657 (+0.90z)| lr 5.94e-04 | 8454.33 ms | -100.0% bf16 MFU | 62012 tok/s +step 1891/19560 | loss 3.873864 (-0.59z)| norm 0.3333 (+0.18z)| lr 5.94e-04 | 8455.75 ms | -100.0% bf16 MFU | 62011 tok/s +step 1892/19560 | loss 3.893673 (-0.20z)| norm 0.3104 (-0.33z)| lr 5.94e-04 | 8454.92 ms | -100.0% bf16 MFU | 62011 tok/s +step 1893/19560 | loss 3.914226 (+0.21z)| norm 0.3043 (-0.46z)| lr 5.94e-04 | 8457.78 ms | -100.0% bf16 MFU | 62010 tok/s +step 1894/19560 | loss 3.890317 (-0.25z)| norm 0.2925 (-0.72z)| lr 5.94e-04 | 8459.84 ms | -100.0% bf16 MFU | 62008 tok/s +step 1895/19560 | loss 3.880097 (-0.44z)| norm 0.2994 (-0.56z)| lr 5.94e-04 | 8456.01 ms | -100.0% bf16 MFU | 62008 tok/s +step 1896/19560 | loss 3.889001 (-0.26z)| norm 0.2647 (-1.32z)| lr 5.94e-04 | 8452.14 ms | -100.0% bf16 MFU | 62009 tok/s +step 1897/19560 | loss 3.826060 (-1.46z)| norm 0.2835 (-0.89z)| lr 5.94e-04 | 8453.20 ms | -100.0% bf16 MFU | 62010 tok/s +step 1898/19560 | loss 3.823483 (-1.49z)| norm 0.2916 (-0.71z)| lr 5.94e-04 | 8457.00 ms | -100.0% bf16 MFU | 62009 tok/s +step 1899/19560 | loss 3.908053 (+0.14z)| norm 0.3205 (-0.01z)| lr 5.94e-04 | 8454.06 ms | -100.0% bf16 MFU | 62009 tok/s +step 1900/19560 | loss 3.878951 (-0.41z)| norm 0.3252 (+0.15z)| lr 5.94e-04 | 8452.81 ms | -100.0% bf16 MFU | 62010 tok/s +step 1901/19560 | loss 3.882197 (-0.34z)| norm 0.3161 (-0.10z)| lr 5.94e-04 | 8455.01 ms | -100.0% bf16 MFU | 62010 tok/s +step 1902/19560 | loss 3.898091 (-0.03z)| norm 0.3031 (-0.45z)| lr 5.94e-04 | 8458.96 ms | -100.0% bf16 MFU | 62009 tok/s +step 1903/19560 | loss 3.899800 (+0.01z)| norm 0.2982 (-0.59z)| lr 5.94e-04 | 8456.17 ms | -100.0% bf16 MFU | 62008 tok/s +step 1904/19560 | loss 3.847444 (-0.99z)| norm 0.2866 (-0.91z)| lr 5.94e-04 | 8453.12 ms | -100.0% bf16 MFU | 62009 tok/s +step 1905/19560 | loss 3.885438 (-0.25z)| norm 0.3273 (+0.21z)| lr 5.94e-04 | 8455.81 ms | -100.0% bf16 MFU | 62009 tok/s +step 1906/19560 | loss 3.914310 (+0.32z)| norm 0.3140 (-0.17z)| lr 5.94e-04 | 8451.60 ms | -100.0% bf16 MFU | 62010 tok/s +step 1907/19560 | loss 3.785082 (-2.15z)| norm 0.3223 (+0.05z)| lr 5.94e-04 | 8455.24 ms | -100.0% bf16 MFU | 62010 tok/s +step 1908/19560 | loss 3.910958 (+0.28z)| norm 0.3252 (+0.13z)| lr 5.94e-04 | 8458.40 ms | -100.0% bf16 MFU | 62009 tok/s +step 1909/19560 | loss 3.828417 (-1.30z)| norm 0.3067 (-0.40z)| lr 5.94e-04 | 8464.29 ms | -100.0% bf16 MFU | 62005 tok/s +step 1910/19560 | loss 3.941317 (+0.88z)| norm 0.3024 (-0.53z)| lr 5.94e-04 | 8457.92 ms | -100.0% bf16 MFU | 62004 tok/s +step 1911/19560 | loss 3.866121 (-0.56z)| norm 0.3008 (-0.59z)| lr 5.94e-04 | 8455.96 ms | -100.0% bf16 MFU | 62004 tok/s +step 1912/19560 | loss 3.868867 (-0.50z)| norm 0.2895 (-0.91z)| lr 5.94e-04 | 8458.92 ms | -100.0% bf16 MFU | 62003 tok/s +step 1913/19560 | loss 3.843343 (-0.98z)| norm 0.3018 (-0.56z)| lr 5.94e-04 | 8458.00 ms | -100.0% bf16 MFU | 62002 tok/s +step 1914/19560 | loss 3.849080 (-0.85z)| norm 0.3018 (-0.57z)| lr 5.94e-04 | 8456.62 ms | -100.0% bf16 MFU | 62002 tok/s +step 1915/19560 | loss 3.860663 (-0.62z)| norm 0.2967 (-0.71z)| lr 5.94e-04 | 8455.96 ms | -100.0% bf16 MFU | 62002 tok/s +step 1916/19560 | loss 3.898561 (+0.12z)| norm 0.3392 (+0.51z)| lr 5.94e-04 | 8463.74 ms | -100.0% bf16 MFU | 61999 tok/s +step 1917/19560 | loss 3.901823 (+0.18z)| norm 0.3164 (-0.15z)| lr 5.94e-04 | 8458.51 ms | -100.0% bf16 MFU | 61998 tok/s +step 1918/19560 | loss 3.945873 (+1.03z)| norm 0.3303 (+0.25z)| lr 5.94e-04 | 8460.12 ms | -100.0% bf16 MFU | 61997 tok/s +step 1919/19560 | loss 3.883475 (-0.18z)| norm 0.3768 (+1.57z)| lr 5.94e-04 | 8458.32 ms | -100.0% bf16 MFU | 61996 tok/s +step 1920/19560 | loss 3.857706 (-0.67z)| norm 0.3475 (+0.74z)| lr 5.94e-04 | 8458.71 ms | -100.0% bf16 MFU | 61996 tok/s +step 1921/19560 | loss 3.943933 (+0.99z)| norm 0.2930 (-0.81z)| lr 5.94e-04 | 8459.10 ms | -100.0% bf16 MFU | 61995 tok/s +step 1922/19560 | loss 3.869658 (-0.44z)| norm 0.3079 (-0.39z)| lr 5.94e-04 | 8458.73 ms | -100.0% bf16 MFU | 61994 tok/s +step 1923/19560 | loss 3.865010 (-0.52z)| norm 0.2888 (-0.92z)| lr 5.94e-04 | 8457.44 ms | -100.0% bf16 MFU | 61994 tok/s +step 1924/19560 | loss 3.864742 (-0.52z)| norm 0.4228 (+2.80z)| lr 5.94e-04 | 8456.00 ms | -100.0% bf16 MFU | 61994 tok/s +step 1925/19560 | loss 3.848275 (-0.83z)| norm 0.3185 (-0.07z)| lr 5.94e-04 | 8457.28 ms | -100.0% bf16 MFU | 61994 tok/s +step 1926/19560 | loss 3.841547 (-0.94z)| norm 0.2702 (-1.43z)| lr 5.94e-04 | 8458.00 ms | -100.0% bf16 MFU | 61994 tok/s +step 1927/19560 | loss 3.922894 (+0.65z)| norm 0.3068 (-0.38z)| lr 5.94e-04 | 8462.38 ms | -100.0% bf16 MFU | 61992 tok/s +step 1928/19560 | loss 3.904621 (+0.30z)| norm 0.3054 (-0.42z)| lr 5.94e-04 | 8455.13 ms | -100.0% bf16 MFU | 61993 tok/s +step 1929/19560 | loss 3.876181 (-0.25z)| norm 0.2603 (-1.69z)| lr 5.94e-04 | 8462.15 ms | -100.0% bf16 MFU | 61991 tok/s +step 1930/19560 | loss 3.849804 (-0.76z)| norm 0.2625 (-1.60z)| lr 5.94e-04 | 8453.71 ms | -100.0% bf16 MFU | 61992 tok/s +step 1931/19560 | loss 3.898980 (+0.22z)| norm 0.2774 (-1.16z)| lr 5.94e-04 | 8459.04 ms | -100.0% bf16 MFU | 61992 tok/s +step 1932/19560 | loss 3.852969 (-0.69z)| norm 0.2811 (-1.04z)| lr 5.94e-04 | 8464.75 ms | -100.0% bf16 MFU | 61989 tok/s +step 1933/19560 | loss 3.835655 (-1.02z)| norm 0.3424 (+0.67z)| lr 5.94e-04 | 8453.12 ms | -100.0% bf16 MFU | 61991 tok/s +step 1934/19560 | loss 3.880431 (-0.11z)| norm 0.3871 (+1.87z)| lr 5.94e-04 | 8456.73 ms | -100.0% bf16 MFU | 61991 tok/s +step 1935/19560 | loss 3.787120 (-1.94z)| norm 0.4113 (+2.46z)| lr 5.94e-04 | 8458.31 ms | -100.0% bf16 MFU | 61991 tok/s +step 1936/19560 | loss 3.872582 (-0.24z)| norm 0.3916 (+1.89z)| lr 5.94e-04 | 8459.06 ms | -100.0% bf16 MFU | 61990 tok/s +step 1937/19560 | loss 3.904525 (+0.40z)| norm 0.3195 (-0.02z)| lr 5.94e-04 | 8458.74 ms | -100.0% bf16 MFU | 61990 tok/s +step 1938/19560 | loss 3.862267 (-0.44z)| norm 0.3257 (+0.14z)| lr 5.94e-04 | 8462.66 ms | -100.0% bf16 MFU | 61988 tok/s +step 1939/19560 | loss 3.838657 (-0.90z)| norm 0.2864 (-0.90z)| lr 5.94e-04 | 8457.14 ms | -100.0% bf16 MFU | 61988 tok/s +step 1940/19560 | loss 3.920475 (+0.76z)| norm 0.3051 (-0.40z)| lr 5.94e-04 | 8458.23 ms | -100.0% bf16 MFU | 61988 tok/s +step 1941/19560 | loss 3.863482 (-0.38z)| norm 0.3353 (+0.41z)| lr 5.94e-04 | 8457.04 ms | -100.0% bf16 MFU | 61988 tok/s +step 1942/19560 | loss 3.901680 (+0.41z)| norm 0.3551 (+0.93z)| lr 5.94e-04 | 8457.32 ms | -100.0% bf16 MFU | 61989 tok/s +step 1943/19560 | loss 3.905844 (+0.50z)| norm 0.3513 (+0.82z)| lr 5.94e-04 | 8458.35 ms | -100.0% bf16 MFU | 61988 tok/s +step 1944/19560 | loss 3.901058 (+0.41z)| norm 0.3310 (+0.28z)| lr 5.94e-04 | 8455.07 ms | -100.0% bf16 MFU | 61989 tok/s +step 1945/19560 | loss 3.789455 (-1.88z)| norm 0.3202 (-0.01z)| lr 5.94e-04 | 8453.63 ms | -100.0% bf16 MFU | 61991 tok/s +step 1946/19560 | loss 3.850432 (-0.62z)| norm 0.2998 (-0.55z)| lr 5.94e-04 | 8455.73 ms | -100.0% bf16 MFU | 61992 tok/s +step 1947/19560 | loss 3.867184 (-0.26z)| norm 0.3148 (-0.15z)| lr 5.94e-04 | 8455.35 ms | -100.0% bf16 MFU | 61992 tok/s +step 1948/19560 | loss 3.870498 (-0.19z)| norm 0.3292 (+0.24z)| lr 5.94e-04 | 8452.88 ms | -100.0% bf16 MFU | 61994 tok/s +step 1949/19560 | loss 3.862731 (-0.34z)| norm 0.2962 (-0.63z)| lr 5.94e-04 | 8456.62 ms | -100.0% bf16 MFU | 61994 tok/s +step 1950/19560 | loss 3.987147 (+2.32z)| norm 0.2930 (-0.71z)| lr 5.94e-04 | 8453.29 ms | -100.0% bf16 MFU | 61995 tok/s +step 1951/19560 | loss 3.873745 (-0.11z)| norm 0.2978 (-0.58z)| lr 5.94e-04 | 8455.94 ms | -100.0% bf16 MFU | 61996 tok/s +step 1952/19560 | loss 3.885903 (+0.14z)| norm 0.2938 (-0.68z)| lr 5.94e-04 | 8455.64 ms | -100.0% bf16 MFU | 61996 tok/s +step 1953/19560 | loss 3.807453 (-1.54z)| norm 0.3332 (+0.36z)| lr 5.93e-04 | 8454.63 ms | -100.0% bf16 MFU | 61997 tok/s +step 1954/19560 | loss 3.900538 (+0.44z)| norm 0.2978 (-0.57z)| lr 5.93e-04 | 8455.21 ms | -100.0% bf16 MFU | 61998 tok/s +step 1955/19560 | loss 3.936790 (+1.21z)| norm 0.3197 (+0.01z)| lr 5.93e-04 | 8455.89 ms | -100.0% bf16 MFU | 61998 tok/s +step 1956/19560 | loss 3.863381 (-0.37z)| norm 0.2765 (-1.11z)| lr 5.93e-04 | 8457.36 ms | -100.0% bf16 MFU | 61998 tok/s +step 1957/19560 | loss 3.874712 (-0.13z)| norm 0.2793 (-1.03z)| lr 5.93e-04 | 8452.58 ms | -100.0% bf16 MFU | 61999 tok/s +step 1958/19560 | loss 3.813389 (-1.44z)| norm 0.2921 (-0.69z)| lr 5.93e-04 | 8454.70 ms | -100.0% bf16 MFU | 62000 tok/s +step 1959/19560 | loss 3.879860 (+0.00z)| norm 0.2748 (-1.12z)| lr 5.93e-04 | 8455.30 ms | -100.0% bf16 MFU | 62000 tok/s +step 1960/19560 | loss 3.858765 (-0.47z)| norm 0.2934 (-0.63z)| lr 5.93e-04 | 8454.94 ms | -100.0% bf16 MFU | 62000 tok/s +step 1961/19560 | loss 3.872236 (-0.17z)| norm 0.2749 (-1.11z)| lr 5.93e-04 | 8454.19 ms | -100.0% bf16 MFU | 62001 tok/s +step 1962/19560 | loss 3.894130 (+0.30z)| norm 0.2887 (-0.74z)| lr 5.93e-04 | 8452.96 ms | -100.0% bf16 MFU | 62002 tok/s +step 1963/19560 | loss 3.806607 (-1.64z)| norm 0.3229 (+0.14z)| lr 5.93e-04 | 8459.56 ms | -100.0% bf16 MFU | 62001 tok/s +step 1964/19560 | loss 3.790787 (-1.95z)| norm 0.3583 (+1.04z)| lr 5.93e-04 | 8452.31 ms | -100.0% bf16 MFU | 62002 tok/s +step 1965/19560 | loss 3.857481 (-0.46z)| norm 0.3271 (+0.22z)| lr 5.93e-04 | 8451.66 ms | -100.0% bf16 MFU | 62004 tok/s +step 1966/19560 | loss 3.825503 (-1.18z)| norm 0.2868 (-0.85z)| lr 5.93e-04 | 8455.40 ms | -100.0% bf16 MFU | 62004 tok/s +step 1967/19560 | loss 3.883533 (+0.12z)| norm 0.3115 (-0.19z)| lr 5.93e-04 | 8454.94 ms | -100.0% bf16 MFU | 62004 tok/s +step 1968/19560 | loss 3.889512 (+0.26z)| norm 0.2923 (-0.70z)| lr 5.93e-04 | 8451.23 ms | -100.0% bf16 MFU | 62006 tok/s +step 1969/19560 | loss 3.815161 (-1.40z)| norm 0.2702 (-1.28z)| lr 5.93e-04 | 8452.93 ms | -100.0% bf16 MFU | 62007 tok/s +step 1970/19560 | loss 3.809504 (-1.51z)| norm 0.3104 (-0.22z)| lr 5.93e-04 | 8455.89 ms | -100.0% bf16 MFU | 62007 tok/s +step 1971/19560 | loss 3.846487 (-0.67z)| norm 0.3740 (+1.43z)| lr 5.93e-04 | 8454.41 ms | -100.0% bf16 MFU | 62007 tok/s +step 1972/19560 | loss 3.881198 (+0.10z)| norm 0.3925 (+1.88z)| lr 5.93e-04 | 8453.59 ms | -100.0% bf16 MFU | 62008 tok/s +step 1973/19560 | loss 3.833665 (-0.95z)| norm 0.3689 (+1.26z)| lr 5.93e-04 | 8456.69 ms | -100.0% bf16 MFU | 62007 tok/s +step 1974/19560 | loss 3.851456 (-0.54z)| norm 0.3617 (+1.07z)| lr 5.93e-04 | 8455.56 ms | -100.0% bf16 MFU | 62007 tok/s +step 1975/19560 | loss 3.859023 (-0.37z)| norm 0.3495 (+0.77z)| lr 5.93e-04 | 8455.32 ms | -100.0% bf16 MFU | 62007 tok/s +step 1976/19560 | loss 3.905229 (+0.65z)| norm 0.3261 (+0.17z)| lr 5.93e-04 | 8456.10 ms | -100.0% bf16 MFU | 62007 tok/s +step 1977/19560 | loss 3.884849 (+0.20z)| norm 0.3302 (+0.28z)| lr 5.93e-04 | 8456.40 ms | -100.0% bf16 MFU | 62006 tok/s +step 1978/19560 | loss 3.823082 (-1.16z)| norm 0.3194 (+0.01z)| lr 5.93e-04 | 8456.02 ms | -100.0% bf16 MFU | 62006 tok/s +step 1979/19560 | loss 3.952329 (+1.67z)| norm 0.3463 (+0.70z)| lr 5.93e-04 | 8455.52 ms | -100.0% bf16 MFU | 62006 tok/s +step 1980/19560 | loss 3.904506 (+0.65z)| norm 0.3031 (-0.41z)| lr 5.93e-04 | 8453.78 ms | -100.0% bf16 MFU | 62007 tok/s +step 1981/19560 | loss 3.841020 (-0.77z)| norm 0.3163 (-0.06z)| lr 5.93e-04 | 8448.65 ms | -100.0% bf16 MFU | 62009 tok/s +step 1982/19560 | loss 3.870136 (-0.11z)| norm 0.2705 (-1.24z)| lr 5.93e-04 | 8439.17 ms | -100.0% bf16 MFU | 62015 tok/s +step 1983/19560 | loss 3.843300 (-0.70z)| norm 0.2774 (-1.04z)| lr 5.93e-04 | 8438.72 ms | -100.0% bf16 MFU | 62021 tok/s +step 1984/19560 | loss 3.844153 (-0.68z)| norm 0.3130 (-0.11z)| lr 5.93e-04 | 8436.60 ms | -100.0% bf16 MFU | 62027 tok/s +step 1985/19560 | loss 3.882843 (+0.21z)| norm 0.2887 (-0.75z)| lr 5.93e-04 | 8436.62 ms | -100.0% bf16 MFU | 62033 tok/s +step 1986/19560 | loss 3.902782 (+0.67z)| norm 0.3084 (-0.23z)| lr 5.93e-04 | 8435.65 ms | -100.0% bf16 MFU | 62039 tok/s +step 1987/19560 | loss 3.901201 (+0.63z)| norm 0.3224 (+0.13z)| lr 5.93e-04 | 8438.30 ms | -100.0% bf16 MFU | 62043 tok/s +step 1988/19560 | loss 3.849313 (-0.56z)| norm 0.3038 (-0.36z)| lr 5.93e-04 | 8437.59 ms | -100.0% bf16 MFU | 62048 tok/s +step 1989/19560 | loss 3.895973 (+0.51z)| norm 0.3031 (-0.38z)| lr 5.93e-04 | 8433.92 ms | -100.0% bf16 MFU | 62054 tok/s +step 1990/19560 | loss 3.892582 (+0.43z)| norm 0.2930 (-0.64z)| lr 5.93e-04 | 8435.04 ms | -100.0% bf16 MFU | 62059 tok/s +step 1991/19560 | loss 3.845108 (-0.68z)| norm 0.3095 (-0.20z)| lr 5.93e-04 | 8435.54 ms | -100.0% bf16 MFU | 62064 tok/s +step 1992/19560 | loss 3.797905 (-1.74z)| norm 0.2922 (-0.65z)| lr 5.93e-04 | 8437.29 ms | -100.0% bf16 MFU | 62067 tok/s +step 1993/19560 | loss 3.836843 (-0.83z)| norm 0.3097 (-0.19z)| lr 5.93e-04 | 8435.29 ms | -100.0% bf16 MFU | 62072 tok/s +step 1994/19560 | loss 3.828599 (-1.00z)| norm 0.3256 (+0.22z)| lr 5.93e-04 | 8436.19 ms | -100.0% bf16 MFU | 62076 tok/s +step 1995/19560 | loss 3.883589 (+0.30z)| norm 0.3135 (-0.09z)| lr 5.93e-04 | 8438.34 ms | -100.0% bf16 MFU | 62078 tok/s +step 1996/19560 | loss 3.861276 (-0.25z)| norm 0.2874 (-0.76z)| lr 5.93e-04 | 8437.93 ms | -100.0% bf16 MFU | 62081 tok/s +step 1997/19560 | loss 3.955156 (+1.97z)| norm 0.3142 (-0.05z)| lr 5.93e-04 | 8436.77 ms | -100.0% bf16 MFU | 62084 tok/s +step 1998/19560 | loss 3.850777 (-0.51z)| norm 0.3202 (+0.15z)| lr 5.93e-04 | 8433.64 ms | -100.0% bf16 MFU | 62088 tok/s +step 1999/19560 | loss 3.843589 (-0.67z)| norm 0.3117 (-0.06z)| lr 5.93e-04 | 8437.92 ms | -100.0% bf16 MFU | 62091 tok/s +step 2000/19560 | loss 3.846493 (-0.60z)| norm 0.2896 (-0.71z)| lr 5.93e-04 | 8440.15 ms | -100.0% bf16 MFU | 62092 tok/s +val loss 3.863114 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2644/10042 = 0.263294 +step 2001/19560 | loss 3.874212 (+0.05z)| norm 0.3162 (+0.07z)| lr 5.93e-04 | 8449.72 ms | -100.0% bf16 MFU | 62090 tok/s +step 2002/19560 | loss 3.800112 (-1.68z)| norm 0.3452 (+0.92z)| lr 5.93e-04 | 8446.41 ms | -100.0% bf16 MFU | 62089 tok/s +step 2003/19560 | loss 3.814309 (-1.32z)| norm 0.3615 (+1.39z)| lr 5.93e-04 | 8450.82 ms | -100.0% bf16 MFU | 62086 tok/s +step 2004/19560 | loss 3.857933 (-0.30z)| norm 0.3042 (-0.32z)| lr 5.93e-04 | 8450.10 ms | -100.0% bf16 MFU | 62084 tok/s +step 2005/19560 | loss 3.773013 (-2.24z)| norm 0.2814 (-0.99z)| lr 5.93e-04 | 8450.72 ms | -100.0% bf16 MFU | 62082 tok/s +step 2006/19560 | loss 3.820482 (-1.14z)| norm 0.2608 (-1.59z)| lr 5.93e-04 | 8451.23 ms | -100.0% bf16 MFU | 62080 tok/s +step 2007/19560 | loss 3.870704 (+0.00z)| norm 0.2870 (-0.81z)| lr 5.93e-04 | 8449.21 ms | -100.0% bf16 MFU | 62079 tok/s +step 2008/19560 | loss 3.853034 (-0.39z)| norm 0.3024 (-0.36z)| lr 5.93e-04 | 8451.22 ms | -100.0% bf16 MFU | 62076 tok/s +step 2009/19560 | loss 3.877903 (+0.20z)| norm 0.3074 (-0.22z)| lr 5.93e-04 | 8451.18 ms | -100.0% bf16 MFU | 62075 tok/s +step 2010/19560 | loss 3.851096 (-0.45z)| norm 0.2652 (-1.47z)| lr 5.93e-04 | 8448.26 ms | -100.0% bf16 MFU | 62074 tok/s +step 2011/19560 | loss 3.781471 (-2.11z)| norm 0.2704 (-1.32z)| lr 5.93e-04 | 8451.33 ms | -100.0% bf16 MFU | 62072 tok/s +step 2012/19560 | loss 3.842626 (-0.64z)| norm 0.2541 (-1.76z)| lr 5.93e-04 | 8447.50 ms | -100.0% bf16 MFU | 62071 tok/s +step 2013/19560 | loss 3.866478 (-0.03z)| norm 0.2842 (-0.87z)| lr 5.93e-04 | 8463.42 ms | -100.0% bf16 MFU | 62065 tok/s +step 2014/19560 | loss 3.818048 (-1.25z)| norm 0.3210 (+0.21z)| lr 5.93e-04 | 8472.83 ms | -100.0% bf16 MFU | 62056 tok/s +step 2015/19560 | loss 3.785901 (-2.02z)| norm 0.3247 (+0.32z)| lr 5.93e-04 | 8473.61 ms | -100.0% bf16 MFU | 62047 tok/s +step 2016/19560 | loss 3.835835 (-0.77z)| norm 0.3303 (+0.55z)| lr 5.93e-04 | 8478.19 ms | -100.0% bf16 MFU | 62036 tok/s +step 2017/19560 | loss 3.848180 (-0.45z)| norm 0.3059 (-0.20z)| lr 5.93e-04 | 8474.26 ms | -100.0% bf16 MFU | 62028 tok/s +step 2018/19560 | loss 3.816006 (-1.24z)| norm 0.2823 (-0.95z)| lr 5.93e-04 | 8466.76 ms | -100.0% bf16 MFU | 62023 tok/s +step 2019/19560 | loss 3.783140 (-2.02z)| norm 0.2695 (-1.34z)| lr 5.93e-04 | 8470.73 ms | -100.0% bf16 MFU | 62016 tok/s +step 2020/19560 | loss 3.817666 (-1.15z)| norm 0.2747 (-1.16z)| lr 5.93e-04 | 8474.82 ms | -100.0% bf16 MFU | 62009 tok/s +step 2021/19560 | loss 3.799994 (-1.56z)| norm 0.2834 (-0.87z)| lr 5.93e-04 | 8470.41 ms | -100.0% bf16 MFU | 62003 tok/s +step 2022/19560 | loss 3.853180 (-0.23z)| norm 0.3089 (-0.05z)| lr 5.93e-04 | 8474.62 ms | -100.0% bf16 MFU | 61996 tok/s +step 2023/19560 | loss 3.880176 (+0.44z)| norm 0.3229 (+0.39z)| lr 5.93e-04 | 8471.35 ms | -100.0% bf16 MFU | 61991 tok/s +step 2024/19560 | loss 3.826475 (-0.88z)| norm 0.3484 (+1.19z)| lr 5.93e-04 | 8462.38 ms | -100.0% bf16 MFU | 61989 tok/s +step 2025/19560 | loss 3.881614 (+0.47z)| norm 0.3508 (+1.25z)| lr 5.93e-04 | 8468.88 ms | -100.0% bf16 MFU | 61985 tok/s +step 2026/19560 | loss 3.906803 (+1.08z)| norm 0.3141 (+0.06z)| lr 5.93e-04 | 8471.03 ms | -100.0% bf16 MFU | 61980 tok/s +step 2027/19560 | loss 3.759839 (-2.48z)| norm 0.3181 (+0.19z)| lr 5.93e-04 | 8472.10 ms | -100.0% bf16 MFU | 61976 tok/s +step 2028/19560 | loss 3.813191 (-1.17z)| norm 0.2962 (-0.51z)| lr 5.93e-04 | 8469.42 ms | -100.0% bf16 MFU | 61972 tok/s +step 2029/19560 | loss 3.812470 (-1.17z)| norm 0.3154 (+0.11z)| lr 5.93e-04 | 8469.61 ms | -100.0% bf16 MFU | 61968 tok/s +step 2030/19560 | loss 3.839086 (-0.52z)| norm 0.3586 (+1.48z)| lr 5.93e-04 | 8469.96 ms | -100.0% bf16 MFU | 61965 tok/s +step 2031/19560 | loss 3.829062 (-0.75z)| norm 0.3670 (+1.71z)| lr 5.93e-04 | 8467.43 ms | -100.0% bf16 MFU | 61963 tok/s +step 2032/19560 | loss 3.886768 (+0.64z)| norm 0.3459 (+1.03z)| lr 5.93e-04 | 8463.92 ms | -100.0% bf16 MFU | 61962 tok/s +step 2033/19560 | loss 3.819273 (-0.97z)| norm 0.3515 (+1.19z)| lr 5.93e-04 | 8466.08 ms | -100.0% bf16 MFU | 61960 tok/s +step 2034/19560 | loss 3.859671 (+0.01z)| norm 0.3567 (+1.34z)| lr 5.93e-04 | 8469.86 ms | -100.0% bf16 MFU | 61957 tok/s +step 2035/19560 | loss 3.808542 (-1.24z)| norm 0.3729 (+1.80z)| lr 5.93e-04 | 8467.09 ms | -100.0% bf16 MFU | 61955 tok/s +step 2036/19560 | loss 3.808048 (-1.24z)| norm 0.3253 (+0.34z)| lr 5.93e-04 | 8463.36 ms | -100.0% bf16 MFU | 61955 tok/s +step 2037/19560 | loss 3.891502 (+0.79z)| norm 0.2988 (-0.47z)| lr 5.93e-04 | 8466.87 ms | -100.0% bf16 MFU | 61953 tok/s +step 2038/19560 | loss 3.838163 (-0.50z)| norm 0.3108 (-0.11z)| lr 5.93e-04 | 8466.40 ms | -100.0% bf16 MFU | 61952 tok/s +step 2039/19560 | loss 3.868297 (+0.25z)| norm 0.2636 (-1.54z)| lr 5.93e-04 | 8456.02 ms | -100.0% bf16 MFU | 61954 tok/s +step 2040/19560 | loss 3.847491 (-0.27z)| norm 0.3032 (-0.33z)| lr 5.93e-04 | 8463.72 ms | -100.0% bf16 MFU | 61954 tok/s +step 2041/19560 | loss 3.868819 (+0.26z)| norm 0.2937 (-0.62z)| lr 5.93e-04 | 8466.02 ms | -100.0% bf16 MFU | 61953 tok/s +step 2042/19560 | loss 3.757685 (-2.42z)| norm 0.2638 (-1.51z)| lr 5.93e-04 | 8466.40 ms | -100.0% bf16 MFU | 61951 tok/s +step 2043/19560 | loss 3.857454 (-0.01z)| norm 0.2710 (-1.28z)| lr 5.93e-04 | 8457.84 ms | -100.0% bf16 MFU | 61953 tok/s +step 2044/19560 | loss 3.842419 (-0.36z)| norm 0.2661 (-1.40z)| lr 5.93e-04 | 8466.04 ms | -100.0% bf16 MFU | 61952 tok/s +step 2045/19560 | loss 3.853661 (-0.08z)| norm 0.2786 (-1.02z)| lr 5.93e-04 | 8464.20 ms | -100.0% bf16 MFU | 61951 tok/s +step 2046/19560 | loss 3.819781 (-0.90z)| norm 0.2922 (-0.60z)| lr 5.93e-04 | 8456.99 ms | -100.0% bf16 MFU | 61954 tok/s +step 2047/19560 | loss 3.835275 (-0.50z)| norm 0.2655 (-1.38z)| lr 5.92e-04 | 8464.93 ms | -100.0% bf16 MFU | 61953 tok/s +step 2048/19560 | loss 3.875620 (+0.50z)| norm 0.2740 (-1.11z)| lr 5.92e-04 | 8459.82 ms | -100.0% bf16 MFU | 61954 tok/s +step 2049/19560 | loss 3.824682 (-0.76z)| norm 0.2695 (-1.23z)| lr 5.92e-04 | 8466.09 ms | -100.0% bf16 MFU | 61952 tok/s +step 2050/19560 | loss 3.795851 (-1.46z)| norm 0.2816 (-0.86z)| lr 5.92e-04 | 8460.82 ms | -100.0% bf16 MFU | 61953 tok/s +step 2051/19560 | loss 3.806566 (-1.17z)| norm 0.3093 (-0.04z)| lr 5.92e-04 | 8463.70 ms | -100.0% bf16 MFU | 61953 tok/s +step 2052/19560 | loss 3.810306 (-1.06z)| norm 0.3309 (+0.65z)| lr 5.92e-04 | 8460.25 ms | -100.0% bf16 MFU | 61954 tok/s +step 2053/19560 | loss 3.803005 (-1.23z)| norm 0.3609 (+1.56z)| lr 5.92e-04 | 8466.09 ms | -100.0% bf16 MFU | 61952 tok/s +step 2054/19560 | loss 3.868225 (+0.37z)| norm 0.3727 (+1.89z)| lr 5.92e-04 | 8461.27 ms | -100.0% bf16 MFU | 61953 tok/s +step 2055/19560 | loss 3.768726 (-2.04z)| norm 0.3588 (+1.44z)| lr 5.92e-04 | 8455.29 ms | -100.0% bf16 MFU | 61956 tok/s +step 2056/19560 | loss 3.832761 (-0.46z)| norm 0.2842 (-0.82z)| lr 5.92e-04 | 8461.04 ms | -100.0% bf16 MFU | 61956 tok/s +step 2057/19560 | loss 3.860359 (+0.23z)| norm 0.2909 (-0.63z)| lr 5.92e-04 | 8460.09 ms | -100.0% bf16 MFU | 61957 tok/s +step 2058/19560 | loss 3.811242 (-0.98z)| norm 0.3217 (+0.30z)| lr 5.92e-04 | 8459.69 ms | -100.0% bf16 MFU | 61958 tok/s +step 2059/19560 | loss 3.840286 (-0.25z)| norm 0.2837 (-0.88z)| lr 5.92e-04 | 8456.37 ms | -100.0% bf16 MFU | 61960 tok/s +step 2060/19560 | loss 3.845103 (-0.13z)| norm 0.3029 (-0.29z)| lr 5.92e-04 | 8463.35 ms | -100.0% bf16 MFU | 61959 tok/s +step 2061/19560 | loss 3.792893 (-1.40z)| norm 0.3268 (+0.45z)| lr 5.92e-04 | 8456.92 ms | -100.0% bf16 MFU | 61961 tok/s +step 2062/19560 | loss 3.864969 (+0.37z)| norm 0.3041 (-0.23z)| lr 5.92e-04 | 8460.42 ms | -100.0% bf16 MFU | 61962 tok/s +step 2063/19560 | loss 3.779219 (-1.73z)| norm 0.2890 (-0.71z)| lr 5.92e-04 | 8458.68 ms | -100.0% bf16 MFU | 61963 tok/s +step 2064/19560 | loss 3.831040 (-0.45z)| norm 0.2885 (-0.72z)| lr 5.92e-04 | 8460.22 ms | -100.0% bf16 MFU | 61963 tok/s +step 2065/19560 | loss 3.812790 (-0.89z)| norm 0.2961 (-0.46z)| lr 5.92e-04 | 8457.24 ms | -100.0% bf16 MFU | 61964 tok/s +step 2066/19560 | loss 3.820720 (-0.68z)| norm 0.3341 (+0.83z)| lr 5.92e-04 | 8454.87 ms | -100.0% bf16 MFU | 61967 tok/s +step 2067/19560 | loss 3.847617 (-0.02z)| norm 0.3073 (-0.09z)| lr 5.92e-04 | 8458.56 ms | -100.0% bf16 MFU | 61968 tok/s +step 2068/19560 | loss 3.818970 (-0.72z)| norm 0.2649 (-1.50z)| lr 5.92e-04 | 8456.45 ms | -100.0% bf16 MFU | 61969 tok/s +step 2069/19560 | loss 3.858786 (+0.27z)| norm 0.2931 (-0.54z)| lr 5.92e-04 | 8461.87 ms | -100.0% bf16 MFU | 61969 tok/s +step 2070/19560 | loss 3.798670 (-1.20z)| norm 0.2839 (-0.84z)| lr 5.92e-04 | 8454.93 ms | -100.0% bf16 MFU | 61971 tok/s +step 2071/19560 | loss 3.762150 (-2.07z)| norm 0.2933 (-0.51z)| lr 5.92e-04 | 8463.59 ms | -100.0% bf16 MFU | 61969 tok/s +step 2072/19560 | loss 3.828904 (-0.41z)| norm 0.3291 (+0.72z)| lr 5.92e-04 | 8456.81 ms | -100.0% bf16 MFU | 61971 tok/s +step 2073/19560 | loss 3.852473 (+0.17z)| norm 0.3511 (+1.45z)| lr 5.92e-04 | 8458.45 ms | -100.0% bf16 MFU | 61971 tok/s +step 2074/19560 | loss 3.836256 (-0.24z)| norm 0.3142 (+0.19z)| lr 5.92e-04 | 8457.30 ms | -100.0% bf16 MFU | 61972 tok/s +step 2075/19560 | loss 3.797054 (-1.20z)| norm 0.3159 (+0.25z)| lr 5.92e-04 | 8448.96 ms | -100.0% bf16 MFU | 61977 tok/s +step 2076/19560 | loss 3.818672 (-0.65z)| norm 0.2736 (-1.17z)| lr 5.92e-04 | 8457.25 ms | -100.0% bf16 MFU | 61977 tok/s +step 2077/19560 | loss 3.809748 (-0.86z)| norm 0.2911 (-0.57z)| lr 5.92e-04 | 8461.36 ms | -100.0% bf16 MFU | 61977 tok/s +step 2078/19560 | loss 3.884078 (+1.06z)| norm 0.3044 (-0.12z)| lr 5.92e-04 | 8459.10 ms | -100.0% bf16 MFU | 61977 tok/s +step 2079/19560 | loss 3.793499 (-1.28z)| norm 0.2730 (-1.17z)| lr 5.92e-04 | 8454.15 ms | -100.0% bf16 MFU | 61979 tok/s +step 2080/19560 | loss 3.804205 (-0.99z)| norm 0.2805 (-0.91z)| lr 5.92e-04 | 8450.35 ms | -100.0% bf16 MFU | 61982 tok/s +step 2081/19560 | loss 3.835496 (-0.18z)| norm 0.2827 (-0.83z)| lr 5.92e-04 | 8455.73 ms | -100.0% bf16 MFU | 61983 tok/s +step 2082/19560 | loss 3.765122 (-1.98z)| norm 0.2710 (-1.21z)| lr 5.92e-04 | 8456.94 ms | -100.0% bf16 MFU | 61984 tok/s +step 2083/19560 | loss 3.820014 (-0.54z)| norm 0.2884 (-0.62z)| lr 5.92e-04 | 8453.87 ms | -100.0% bf16 MFU | 61985 tok/s +step 2084/19560 | loss 3.856493 (+0.43z)| norm 0.3232 (+0.53z)| lr 5.92e-04 | 8456.53 ms | -100.0% bf16 MFU | 61986 tok/s +step 2085/19560 | loss 3.838870 (-0.03z)| norm 0.3554 (+1.58z)| lr 5.92e-04 | 8456.31 ms | -100.0% bf16 MFU | 61987 tok/s +step 2086/19560 | loss 3.821617 (-0.50z)| norm 0.3514 (+1.42z)| lr 5.92e-04 | 8457.27 ms | -100.0% bf16 MFU | 61987 tok/s +step 2087/19560 | loss 3.768070 (-1.88z)| norm 0.3212 (+0.41z)| lr 5.92e-04 | 8460.89 ms | -100.0% bf16 MFU | 61986 tok/s +step 2088/19560 | loss 3.831662 (-0.20z)| norm 0.3204 (+0.38z)| lr 5.92e-04 | 8454.28 ms | -100.0% bf16 MFU | 61987 tok/s +step 2089/19560 | loss 3.821847 (-0.45z)| norm 0.3192 (+0.33z)| lr 5.92e-04 | 8451.67 ms | -100.0% bf16 MFU | 61990 tok/s +step 2090/19560 | loss 3.795482 (-1.13z)| norm 0.2827 (-0.88z)| lr 5.92e-04 | 8454.09 ms | -100.0% bf16 MFU | 61991 tok/s +step 2091/19560 | loss 3.795478 (-1.13z)| norm 0.2715 (-1.24z)| lr 5.92e-04 | 8458.67 ms | -100.0% bf16 MFU | 61990 tok/s +step 2092/19560 | loss 4.004042 (+4.10z)| norm 0.2878 (-0.68z)| lr 5.92e-04 | 8451.42 ms | -100.0% bf16 MFU | 61993 tok/s +step 2093/19560 | loss 3.762141 (-1.88z)| norm 0.3174 (+0.31z)| lr 5.92e-04 | 8450.35 ms | -100.0% bf16 MFU | 61995 tok/s +step 2094/19560 | loss 3.835227 (-0.09z)| norm 0.3132 (+0.16z)| lr 5.92e-04 | 8455.77 ms | -100.0% bf16 MFU | 61996 tok/s +step 2095/19560 | loss 3.786263 (-1.27z)| norm 0.3138 (+0.18z)| lr 5.92e-04 | 8457.80 ms | -100.0% bf16 MFU | 61995 tok/s +step 2096/19560 | loss 3.895426 (+1.40z)| norm 0.2983 (-0.34z)| lr 5.92e-04 | 8454.54 ms | -100.0% bf16 MFU | 61996 tok/s +step 2097/19560 | loss 3.781784 (-1.37z)| norm 0.2764 (-1.08z)| lr 5.92e-04 | 8452.63 ms | -100.0% bf16 MFU | 61998 tok/s +step 2098/19560 | loss 3.873433 (+0.85z)| norm 0.2852 (-0.77z)| lr 5.92e-04 | 8453.36 ms | -100.0% bf16 MFU | 61999 tok/s +step 2099/19560 | loss 3.765175 (-1.74z)| norm 0.2952 (-0.43z)| lr 5.92e-04 | 8458.94 ms | -100.0% bf16 MFU | 61998 tok/s +step 2100/19560 | loss 3.835791 (-0.04z)| norm 0.3138 (+0.23z)| lr 5.92e-04 | 8456.04 ms | -100.0% bf16 MFU | 61998 tok/s +step 2101/19560 | loss 3.856464 (+0.46z)| norm 0.3342 (+0.98z)| lr 5.92e-04 | 8454.49 ms | -100.0% bf16 MFU | 61999 tok/s +step 2102/19560 | loss 3.770501 (-1.59z)| norm 0.3395 (+1.19z)| lr 5.92e-04 | 8456.88 ms | -100.0% bf16 MFU | 61999 tok/s +step 2103/19560 | loss 3.777874 (-1.39z)| norm 0.3194 (+0.47z)| lr 5.92e-04 | 8454.20 ms | -100.0% bf16 MFU | 62000 tok/s +step 2104/19560 | loss 3.788522 (-1.12z)| norm 0.3051 (-0.04z)| lr 5.92e-04 | 8455.74 ms | -100.0% bf16 MFU | 62000 tok/s +step 2105/19560 | loss 3.802351 (-0.78z)| norm 0.2971 (-0.33z)| lr 5.92e-04 | 8454.00 ms | -100.0% bf16 MFU | 62001 tok/s +step 2106/19560 | loss 3.899678 (+1.53z)| norm 0.2842 (-0.79z)| lr 5.92e-04 | 8453.13 ms | -100.0% bf16 MFU | 62002 tok/s +step 2107/19560 | loss 3.850004 (+0.38z)| norm 0.2942 (-0.41z)| lr 5.92e-04 | 8456.67 ms | -100.0% bf16 MFU | 62001 tok/s +step 2108/19560 | loss 3.818937 (-0.37z)| norm 0.3104 (+0.18z)| lr 5.92e-04 | 8454.61 ms | -100.0% bf16 MFU | 62002 tok/s +step 2109/19560 | loss 3.818257 (-0.38z)| norm 0.3190 (+0.50z)| lr 5.92e-04 | 8455.66 ms | -100.0% bf16 MFU | 62002 tok/s +step 2110/19560 | loss 3.821071 (-0.30z)| norm 0.3066 (+0.03z)| lr 5.92e-04 | 8448.84 ms | -100.0% bf16 MFU | 62005 tok/s +step 2111/19560 | loss 3.807081 (-0.64z)| norm 0.2676 (-1.41z)| lr 5.92e-04 | 8455.64 ms | -100.0% bf16 MFU | 62005 tok/s +step 2112/19560 | loss 3.986300 (+3.58z)| norm 0.3020 (-0.13z)| lr 5.92e-04 | 8458.70 ms | -100.0% bf16 MFU | 62004 tok/s +step 2113/19560 | loss 3.832817 (-0.02z)| norm 0.2981 (-0.28z)| lr 5.92e-04 | 8454.61 ms | -100.0% bf16 MFU | 62004 tok/s +step 2114/19560 | loss 3.801439 (-0.75z)| norm 0.2963 (-0.34z)| lr 5.92e-04 | 8453.04 ms | -100.0% bf16 MFU | 62005 tok/s +step 2115/19560 | loss 3.733274 (-2.32z)| norm 0.3039 (-0.06z)| lr 5.92e-04 | 8450.75 ms | -100.0% bf16 MFU | 62007 tok/s +step 2116/19560 | loss 3.897940 (+1.54z)| norm 0.3077 (+0.09z)| lr 5.92e-04 | 8450.84 ms | -100.0% bf16 MFU | 62008 tok/s +step 2117/19560 | loss 3.817803 (-0.32z)| norm 0.3237 (+0.67z)| lr 5.92e-04 | 8457.49 ms | -100.0% bf16 MFU | 62008 tok/s +step 2118/19560 | loss 3.849803 (+0.44z)| norm 0.3105 (+0.18z)| lr 5.92e-04 | 8456.82 ms | -100.0% bf16 MFU | 62007 tok/s +step 2119/19560 | loss 3.860478 (+0.69z)| norm 0.2992 (-0.24z)| lr 5.92e-04 | 8450.14 ms | -100.0% bf16 MFU | 62009 tok/s +step 2120/19560 | loss 3.814046 (-0.41z)| norm 0.3024 (-0.12z)| lr 5.92e-04 | 8456.38 ms | -100.0% bf16 MFU | 62008 tok/s +step 2121/19560 | loss 3.840569 (+0.22z)| norm 0.3246 (+0.70z)| lr 5.92e-04 | 8450.53 ms | -100.0% bf16 MFU | 62010 tok/s +step 2122/19560 | loss 3.813736 (-0.41z)| norm 0.3202 (+0.53z)| lr 5.92e-04 | 8455.03 ms | -100.0% bf16 MFU | 62010 tok/s +step 2123/19560 | loss 3.735408 (-2.21z)| norm 0.3128 (+0.26z)| lr 5.92e-04 | 8450.01 ms | -100.0% bf16 MFU | 62012 tok/s +step 2124/19560 | loss 3.729099 (-2.29z)| norm 0.2980 (-0.29z)| lr 5.92e-04 | 8449.92 ms | -100.0% bf16 MFU | 62014 tok/s +step 2125/19560 | loss 3.801995 (-0.61z)| norm 0.3036 (-0.08z)| lr 5.92e-04 | 8456.31 ms | -100.0% bf16 MFU | 62013 tok/s +step 2126/19560 | loss 3.887414 (+1.40z)| norm 0.3148 (+0.34z)| lr 5.92e-04 | 8453.44 ms | -100.0% bf16 MFU | 62013 tok/s +step 2127/19560 | loss 3.752456 (-1.75z)| norm 0.3255 (+0.73z)| lr 5.92e-04 | 8454.60 ms | -100.0% bf16 MFU | 62013 tok/s +step 2128/19560 | loss 3.803531 (-0.55z)| norm 0.2745 (-1.15z)| lr 5.92e-04 | 8452.45 ms | -100.0% bf16 MFU | 62014 tok/s +step 2129/19560 | loss 3.748055 (-1.80z)| norm 0.2990 (-0.24z)| lr 5.92e-04 | 8449.58 ms | -100.0% bf16 MFU | 62016 tok/s +step 2130/19560 | loss 3.775496 (-1.16z)| norm 0.2834 (-0.81z)| lr 5.92e-04 | 8448.40 ms | -100.0% bf16 MFU | 62018 tok/s +step 2131/19560 | loss 3.793447 (-0.74z)| norm 0.2655 (-1.46z)| lr 5.92e-04 | 8456.93 ms | -100.0% bf16 MFU | 62017 tok/s +step 2132/19560 | loss 3.814071 (-0.26z)| norm 0.2693 (-1.30z)| lr 5.92e-04 | 8453.58 ms | -100.0% bf16 MFU | 62017 tok/s +step 2133/19560 | loss 3.836808 (+0.25z)| norm 0.2726 (-1.17z)| lr 5.92e-04 | 8451.20 ms | -100.0% bf16 MFU | 62018 tok/s +step 2134/19560 | loss 3.796907 (-0.66z)| norm 0.2879 (-0.61z)| lr 5.91e-04 | 8451.61 ms | -100.0% bf16 MFU | 62019 tok/s +step 2135/19560 | loss 3.857730 (+0.74z)| norm 0.3408 (+1.35z)| lr 5.91e-04 | 8448.59 ms | -100.0% bf16 MFU | 62020 tok/s +step 2136/19560 | loss 3.801103 (-0.56z)| norm 0.3635 (+2.14z)| lr 5.91e-04 | 8455.01 ms | -100.0% bf16 MFU | 62020 tok/s +step 2137/19560 | loss 3.841475 (+0.39z)| norm 0.3279 (+0.83z)| lr 5.91e-04 | 8455.52 ms | -100.0% bf16 MFU | 62019 tok/s +step 2138/19560 | loss 3.828795 (+0.09z)| norm 0.2955 (-0.37z)| lr 5.91e-04 | 8450.16 ms | -100.0% bf16 MFU | 62020 tok/s +step 2139/19560 | loss 3.804646 (-0.47z)| norm 0.2874 (-0.68z)| lr 5.91e-04 | 8450.82 ms | -100.0% bf16 MFU | 62021 tok/s +step 2140/19560 | loss 3.800834 (-0.56z)| norm 0.3269 (+0.78z)| lr 5.91e-04 | 8449.24 ms | -100.0% bf16 MFU | 62023 tok/s +step 2141/19560 | loss 3.869543 (+1.05z)| norm 0.3225 (+0.60z)| lr 5.91e-04 | 8453.20 ms | -100.0% bf16 MFU | 62023 tok/s +step 2142/19560 | loss 3.829011 (+0.10z)| norm 0.3121 (+0.21z)| lr 5.91e-04 | 8455.45 ms | -100.0% bf16 MFU | 62022 tok/s +step 2143/19560 | loss 3.788981 (-0.84z)| norm 0.3042 (-0.08z)| lr 5.91e-04 | 8451.86 ms | -100.0% bf16 MFU | 62023 tok/s +step 2144/19560 | loss 3.776364 (-1.11z)| norm 0.2803 (-0.96z)| lr 5.91e-04 | 8450.97 ms | -100.0% bf16 MFU | 62023 tok/s +step 2145/19560 | loss 3.754928 (-1.58z)| norm 0.2921 (-0.51z)| lr 5.91e-04 | 8450.45 ms | -100.0% bf16 MFU | 62024 tok/s +step 2146/19560 | loss 3.731146 (-2.08z)| norm 0.2744 (-1.17z)| lr 5.91e-04 | 8441.99 ms | -100.0% bf16 MFU | 62028 tok/s +step 2147/19560 | loss 3.818649 (-0.10z)| norm 0.2731 (-1.23z)| lr 5.91e-04 | 8437.43 ms | -100.0% bf16 MFU | 62034 tok/s +step 2148/19560 | loss 3.809809 (-0.30z)| norm 0.2745 (-1.17z)| lr 5.91e-04 | 8436.78 ms | -100.0% bf16 MFU | 62039 tok/s +step 2149/19560 | loss 3.757398 (-1.47z)| norm 0.2690 (-1.37z)| lr 5.91e-04 | 8435.01 ms | -100.0% bf16 MFU | 62045 tok/s +step 2150/19560 | loss 3.716571 (-2.32z)| norm 0.3027 (-0.11z)| lr 5.91e-04 | 8434.94 ms | -100.0% bf16 MFU | 62051 tok/s +step 2151/19560 | loss 3.807079 (-0.31z)| norm 0.3174 (+0.44z)| lr 5.91e-04 | 8434.45 ms | -100.0% bf16 MFU | 62056 tok/s +step 2152/19560 | loss 3.829564 (+0.19z)| norm 0.3278 (+0.84z)| lr 5.91e-04 | 8435.28 ms | -100.0% bf16 MFU | 62061 tok/s +step 2153/19560 | loss 3.787497 (-0.74z)| norm 0.3248 (+0.75z)| lr 5.91e-04 | 8433.07 ms | -100.0% bf16 MFU | 62067 tok/s +step 2154/19560 | loss 3.711301 (-2.39z)| norm 0.3062 (+0.04z)| lr 5.91e-04 | 8435.31 ms | -100.0% bf16 MFU | 62071 tok/s +step 2155/19560 | loss 3.794889 (-0.54z)| norm 0.2826 (-0.84z)| lr 5.91e-04 | 8437.13 ms | -100.0% bf16 MFU | 62074 tok/s +step 2156/19560 | loss 3.772050 (-1.04z)| norm 0.2856 (-0.73z)| lr 5.91e-04 | 8439.67 ms | -100.0% bf16 MFU | 62077 tok/s +step 2157/19560 | loss 3.839781 (+0.46z)| norm 0.2861 (-0.70z)| lr 5.91e-04 | 8438.46 ms | -100.0% bf16 MFU | 62080 tok/s +step 2158/19560 | loss 3.897685 (+1.72z)| norm 0.2980 (-0.23z)| lr 5.91e-04 | 8436.30 ms | -100.0% bf16 MFU | 62083 tok/s +step 2159/19560 | loss 3.883461 (+1.38z)| norm 0.2892 (-0.56z)| lr 5.91e-04 | 8439.87 ms | -100.0% bf16 MFU | 62085 tok/s +step 2160/19560 | loss 3.863921 (+0.97z)| norm 0.3056 (+0.10z)| lr 5.91e-04 | 8437.94 ms | -100.0% bf16 MFU | 62087 tok/s +step 2161/19560 | loss 3.829828 (+0.22z)| norm 0.3185 (+0.63z)| lr 5.91e-04 | 8438.62 ms | -100.0% bf16 MFU | 62089 tok/s +step 2162/19560 | loss 3.838115 (+0.40z)| norm 0.2835 (-0.77z)| lr 5.91e-04 | 8439.46 ms | -100.0% bf16 MFU | 62091 tok/s +step 2163/19560 | loss 3.817374 (-0.05z)| norm 0.2824 (-0.81z)| lr 5.91e-04 | 8435.12 ms | -100.0% bf16 MFU | 62094 tok/s +step 2164/19560 | loss 3.795086 (-0.54z)| norm 0.2957 (-0.24z)| lr 5.91e-04 | 8437.97 ms | -100.0% bf16 MFU | 62096 tok/s +step 2165/19560 | loss 3.903100 (+1.82z)| norm 0.2913 (-0.42z)| lr 5.91e-04 | 8436.29 ms | -100.0% bf16 MFU | 62099 tok/s +step 2166/19560 | loss 3.805249 (-0.31z)| norm 0.2664 (-1.46z)| lr 5.91e-04 | 8441.02 ms | -100.0% bf16 MFU | 62099 tok/s +step 2167/19560 | loss 3.781857 (-0.81z)| norm 0.2987 (-0.11z)| lr 5.91e-04 | 8439.51 ms | -100.0% bf16 MFU | 62101 tok/s +step 2168/19560 | loss 3.890485 (+1.55z)| norm 0.3495 (+2.00z)| lr 5.91e-04 | 8441.78 ms | -100.0% bf16 MFU | 62101 tok/s +step 2169/19560 | loss 3.810830 (-0.17z)| norm 0.3167 (+0.63z)| lr 5.91e-04 | 8442.22 ms | -100.0% bf16 MFU | 62101 tok/s +step 2170/19560 | loss 3.734685 (-1.82z)| norm 0.2888 (-0.56z)| lr 5.91e-04 | 8439.61 ms | -100.0% bf16 MFU | 62102 tok/s +step 2171/19560 | loss 3.797275 (-0.45z)| norm 0.2923 (-0.42z)| lr 5.91e-04 | 8438.56 ms | -100.0% bf16 MFU | 62103 tok/s +step 2172/19560 | loss 3.823830 (+0.13z)| norm 0.3329 (+1.29z)| lr 5.91e-04 | 8437.37 ms | -100.0% bf16 MFU | 62105 tok/s +step 2173/19560 | loss 3.814737 (-0.06z)| norm 0.3170 (+0.60z)| lr 5.91e-04 | 8441.68 ms | -100.0% bf16 MFU | 62105 tok/s +step 2174/19560 | loss 3.797129 (-0.44z)| norm 0.3260 (+0.97z)| lr 5.91e-04 | 8441.63 ms | -100.0% bf16 MFU | 62105 tok/s +step 2175/19560 | loss 3.732373 (-1.81z)| norm 0.3330 (+1.25z)| lr 5.91e-04 | 8442.13 ms | -100.0% bf16 MFU | 62105 tok/s +step 2176/19560 | loss 3.853367 (+0.80z)| norm 0.3622 (+2.43z)| lr 5.91e-04 | 8440.25 ms | -100.0% bf16 MFU | 62106 tok/s +step 2177/19560 | loss 3.804480 (-0.25z)| norm 0.3580 (+2.20z)| lr 5.91e-04 | 8444.16 ms | -100.0% bf16 MFU | 62105 tok/s +step 2178/19560 | loss 3.790458 (-0.56z)| norm 0.3600 (+2.22z)| lr 5.91e-04 | 8446.22 ms | -100.0% bf16 MFU | 62104 tok/s +step 2179/19560 | loss 3.829363 (+0.28z)| norm 0.3506 (+1.80z)| lr 5.91e-04 | 8449.47 ms | -100.0% bf16 MFU | 62101 tok/s +step 2180/19560 | loss 3.818506 (+0.04z)| norm 0.3135 (+0.31z)| lr 5.91e-04 | 8448.30 ms | -100.0% bf16 MFU | 62099 tok/s +step 2181/19560 | loss 3.791474 (-0.54z)| norm 0.2860 (-0.80z)| lr 5.91e-04 | 8448.08 ms | -100.0% bf16 MFU | 62097 tok/s +step 2182/19560 | loss 3.783505 (-0.70z)| norm 0.2821 (-0.95z)| lr 5.91e-04 | 8449.29 ms | -100.0% bf16 MFU | 62095 tok/s +step 2183/19560 | loss 3.823729 (+0.16z)| norm 0.2684 (-1.52z)| lr 5.91e-04 | 8449.66 ms | -100.0% bf16 MFU | 62092 tok/s +step 2184/19560 | loss 3.827128 (+0.24z)| norm 0.2738 (-1.28z)| lr 5.91e-04 | 8447.85 ms | -100.0% bf16 MFU | 62091 tok/s +step 2185/19560 | loss 3.814499 (-0.03z)| norm 0.2735 (-1.28z)| lr 5.91e-04 | 8448.30 ms | -100.0% bf16 MFU | 62089 tok/s +step 2186/19560 | loss 3.831897 (+0.35z)| norm 0.2676 (-1.51z)| lr 5.91e-04 | 8448.95 ms | -100.0% bf16 MFU | 62087 tok/s +step 2187/19560 | loss 3.757180 (-1.26z)| norm 0.2824 (-0.88z)| lr 5.91e-04 | 8448.38 ms | -100.0% bf16 MFU | 62086 tok/s +step 2188/19560 | loss 3.725671 (-1.90z)| norm 0.3044 (+0.05z)| lr 5.91e-04 | 8447.88 ms | -100.0% bf16 MFU | 62085 tok/s +step 2189/19560 | loss 3.802557 (-0.25z)| norm 0.2843 (-0.79z)| lr 5.91e-04 | 8449.71 ms | -100.0% bf16 MFU | 62083 tok/s +step 2190/19560 | loss 3.772772 (-0.88z)| norm 0.2969 (-0.25z)| lr 5.91e-04 | 8448.68 ms | -100.0% bf16 MFU | 62081 tok/s +step 2191/19560 | loss 3.849565 (+0.76z)| norm 0.2576 (-1.88z)| lr 5.91e-04 | 8448.60 ms | -100.0% bf16 MFU | 62080 tok/s +step 2192/19560 | loss 3.851449 (+0.79z)| norm 0.2669 (-1.48z)| lr 5.91e-04 | 8447.58 ms | -100.0% bf16 MFU | 62079 tok/s +step 2193/19560 | loss 3.795288 (-0.41z)| norm 0.3007 (-0.08z)| lr 5.91e-04 | 8451.13 ms | -100.0% bf16 MFU | 62077 tok/s +step 2194/19560 | loss 3.798871 (-0.33z)| norm 0.3185 (+0.67z)| lr 5.91e-04 | 8449.86 ms | -100.0% bf16 MFU | 62076 tok/s +step 2195/19560 | loss 3.781515 (-0.69z)| norm 0.3160 (+0.56z)| lr 5.91e-04 | 8450.39 ms | -100.0% bf16 MFU | 62074 tok/s +step 2196/19560 | loss 3.876586 (+1.33z)| norm 0.3298 (+1.12z)| lr 5.91e-04 | 8450.01 ms | -100.0% bf16 MFU | 62073 tok/s +step 2197/19560 | loss 3.942107 (+2.64z)| norm 0.3539 (+2.08z)| lr 5.91e-04 | 8449.00 ms | -100.0% bf16 MFU | 62072 tok/s +step 2198/19560 | loss 3.783869 (-0.64z)| norm 0.3637 (+2.40z)| lr 5.91e-04 | 8451.03 ms | -100.0% bf16 MFU | 62070 tok/s +step 2199/19560 | loss 3.857809 (+0.88z)| norm 0.3600 (+2.19z)| lr 5.91e-04 | 8454.24 ms | -100.0% bf16 MFU | 62067 tok/s +step 2200/19560 | loss 3.847052 (+0.65z)| norm 0.3131 (+0.34z)| lr 5.91e-04 | 8454.85 ms | -100.0% bf16 MFU | 62064 tok/s +step 2201/19560 | loss 3.776073 (-0.80z)| norm 0.3098 (+0.23z)| lr 5.91e-04 | 8453.96 ms | -100.0% bf16 MFU | 62062 tok/s +step 2202/19560 | loss 3.748272 (-1.36z)| norm 0.2815 (-0.90z)| lr 5.91e-04 | 8451.50 ms | -100.0% bf16 MFU | 62061 tok/s +step 2203/19560 | loss 3.797801 (-0.34z)| norm 0.2905 (-0.53z)| lr 5.91e-04 | 8449.40 ms | -100.0% bf16 MFU | 62060 tok/s +step 2204/19560 | loss 3.789847 (-0.50z)| norm 0.2753 (-1.14z)| lr 5.91e-04 | 8464.00 ms | -100.0% bf16 MFU | 62054 tok/s +step 2205/19560 | loss 3.800912 (-0.27z)| norm 0.2754 (-1.12z)| lr 5.91e-04 | 8461.74 ms | -100.0% bf16 MFU | 62050 tok/s +step 2206/19560 | loss 3.818307 (+0.10z)| norm 0.2890 (-0.58z)| lr 5.91e-04 | 8461.49 ms | -100.0% bf16 MFU | 62045 tok/s +step 2207/19560 | loss 3.783522 (-0.62z)| norm 0.3027 (-0.04z)| lr 5.91e-04 | 8460.18 ms | -100.0% bf16 MFU | 62042 tok/s +step 2208/19560 | loss 3.775084 (-0.78z)| norm 0.3184 (+0.58z)| lr 5.91e-04 | 8456.07 ms | -100.0% bf16 MFU | 62039 tok/s +step 2209/19560 | loss 3.837230 (+0.50z)| norm 0.3508 (+1.84z)| lr 5.91e-04 | 8459.76 ms | -100.0% bf16 MFU | 62036 tok/s +step 2210/19560 | loss 3.775558 (-0.78z)| norm 0.3381 (+1.32z)| lr 5.91e-04 | 8454.14 ms | -100.0% bf16 MFU | 62035 tok/s +step 2211/19560 | loss 3.821134 (+0.17z)| norm 0.3122 (+0.28z)| lr 5.91e-04 | 8460.61 ms | -100.0% bf16 MFU | 62032 tok/s +step 2212/19560 | loss 3.847762 (+0.72z)| norm 0.2869 (-0.72z)| lr 5.91e-04 | 8456.37 ms | -100.0% bf16 MFU | 62030 tok/s +step 2213/19560 | loss 3.752505 (-1.23z)| norm 0.3040 (-0.02z)| lr 5.91e-04 | 8463.42 ms | -100.0% bf16 MFU | 62026 tok/s +step 2214/19560 | loss 3.757358 (-1.12z)| norm 0.2908 (-0.54z)| lr 5.91e-04 | 8457.39 ms | -100.0% bf16 MFU | 62024 tok/s +step 2215/19560 | loss 3.773481 (-0.79z)| norm 0.2868 (-0.70z)| lr 5.91e-04 | 8454.40 ms | -100.0% bf16 MFU | 62024 tok/s +step 2216/19560 | loss 3.790175 (-0.44z)| norm 0.2527 (-2.05z)| lr 5.90e-04 | 8455.21 ms | -100.0% bf16 MFU | 62023 tok/s +step 2217/19560 | loss 3.789364 (-0.45z)| norm 0.2870 (-0.65z)| lr 5.90e-04 | 8452.34 ms | -100.0% bf16 MFU | 62023 tok/s +step 2218/19560 | loss 3.844937 (+0.68z)| norm 0.2901 (-0.52z)| lr 5.90e-04 | 8455.91 ms | -100.0% bf16 MFU | 62022 tok/s +step 2219/19560 | loss 3.810855 (-0.02z)| norm 0.3181 (+0.60z)| lr 5.90e-04 | 8452.35 ms | -100.0% bf16 MFU | 62023 tok/s +step 2220/19560 | loss 3.829551 (+0.42z)| norm 0.3442 (+1.63z)| lr 5.90e-04 | 8461.37 ms | -100.0% bf16 MFU | 62020 tok/s +step 2221/19560 | loss 3.758715 (-1.13z)| norm 0.3299 (+1.04z)| lr 5.90e-04 | 8459.66 ms | -100.0% bf16 MFU | 62017 tok/s +step 2222/19560 | loss 3.798035 (-0.27z)| norm 0.2779 (-1.03z)| lr 5.90e-04 | 8457.38 ms | -100.0% bf16 MFU | 62016 tok/s +step 2223/19560 | loss 3.788821 (-0.47z)| norm 0.3371 (+1.32z)| lr 5.90e-04 | 8456.24 ms | -100.0% bf16 MFU | 62015 tok/s +step 2224/19560 | loss 3.820912 (+0.25z)| norm 0.3009 (-0.12z)| lr 5.90e-04 | 8456.28 ms | -100.0% bf16 MFU | 62015 tok/s +step 2225/19560 | loss 3.756491 (-1.17z)| norm 0.3142 (+0.40z)| lr 5.90e-04 | 8455.56 ms | -100.0% bf16 MFU | 62014 tok/s +step 2226/19560 | loss 3.805049 (-0.09z)| norm 0.3216 (+0.69z)| lr 5.90e-04 | 8455.86 ms | -100.0% bf16 MFU | 62013 tok/s +step 2227/19560 | loss 3.812219 (+0.07z)| norm 0.3049 (+0.01z)| lr 5.90e-04 | 8455.61 ms | -100.0% bf16 MFU | 62013 tok/s +step 2228/19560 | loss 3.759342 (-1.10z)| norm 0.2775 (-1.07z)| lr 5.90e-04 | 8449.41 ms | -100.0% bf16 MFU | 62015 tok/s +step 2229/19560 | loss 3.813972 (+0.13z)| norm 0.2837 (-0.81z)| lr 5.90e-04 | 8459.88 ms | -100.0% bf16 MFU | 62013 tok/s +step 2230/19560 | loss 3.771148 (-0.83z)| norm 0.3170 (+0.54z)| lr 5.90e-04 | 8445.89 ms | -100.0% bf16 MFU | 62016 tok/s +step 2231/19560 | loss 3.750937 (-1.27z)| norm 0.3038 (+0.01z)| lr 5.90e-04 | 8443.23 ms | -100.0% bf16 MFU | 62020 tok/s +step 2232/19560 | loss 3.797124 (-0.25z)| norm 0.2953 (-0.33z)| lr 5.90e-04 | 8447.52 ms | -100.0% bf16 MFU | 62022 tok/s +step 2233/19560 | loss 3.784925 (-0.51z)| norm 0.2553 (-1.90z)| lr 5.90e-04 | 8447.75 ms | -100.0% bf16 MFU | 62024 tok/s +step 2234/19560 | loss 3.813682 (+0.14z)| norm 0.2698 (-1.32z)| lr 5.90e-04 | 8458.23 ms | -100.0% bf16 MFU | 62022 tok/s +step 2235/19560 | loss 3.769976 (-0.83z)| norm 0.3032 (+0.00z)| lr 5.90e-04 | 8446.25 ms | -100.0% bf16 MFU | 62025 tok/s +step 2236/19560 | loss 3.805024 (-0.04z)| norm 0.3119 (+0.34z)| lr 5.90e-04 | 8449.31 ms | -100.0% bf16 MFU | 62026 tok/s +step 2237/19560 | loss 3.793425 (-0.30z)| norm 0.2761 (-1.06z)| lr 5.90e-04 | 8445.45 ms | -100.0% bf16 MFU | 62029 tok/s +step 2238/19560 | loss 3.781200 (-0.56z)| norm 0.2672 (-1.38z)| lr 5.90e-04 | 8448.88 ms | -100.0% bf16 MFU | 62030 tok/s +step 2239/19560 | loss 3.799365 (-0.15z)| norm 0.2869 (-0.62z)| lr 5.90e-04 | 8443.96 ms | -100.0% bf16 MFU | 62033 tok/s +step 2240/19560 | loss 3.794422 (-0.25z)| norm 0.2767 (-1.01z)| lr 5.90e-04 | 8451.69 ms | -100.0% bf16 MFU | 62033 tok/s +step 2241/19560 | loss 3.837403 (+0.79z)| norm 0.2934 (-0.36z)| lr 5.90e-04 | 8451.71 ms | -100.0% bf16 MFU | 62033 tok/s +step 2242/19560 | loss 3.812918 (+0.20z)| norm 0.3355 (+1.28z)| lr 5.90e-04 | 8449.71 ms | -100.0% bf16 MFU | 62034 tok/s +step 2243/19560 | loss 3.762335 (-1.04z)| norm 0.3441 (+1.59z)| lr 5.90e-04 | 8449.04 ms | -100.0% bf16 MFU | 62035 tok/s +step 2244/19560 | loss 3.704133 (-2.42z)| norm 0.3115 (+0.33z)| lr 5.90e-04 | 8444.23 ms | -100.0% bf16 MFU | 62037 tok/s +step 2245/19560 | loss 3.783803 (-0.47z)| norm 0.2668 (-1.38z)| lr 5.90e-04 | 8446.22 ms | -100.0% bf16 MFU | 62039 tok/s +step 2246/19560 | loss 3.838051 (+0.85z)| norm 0.2821 (-0.78z)| lr 5.90e-04 | 8445.82 ms | -100.0% bf16 MFU | 62041 tok/s +step 2247/19560 | loss 3.776238 (-0.64z)| norm 0.2969 (-0.21z)| lr 5.90e-04 | 8448.41 ms | -100.0% bf16 MFU | 62042 tok/s +step 2248/19560 | loss 3.823256 (+0.51z)| norm 0.2741 (-1.07z)| lr 5.90e-04 | 8446.18 ms | -100.0% bf16 MFU | 62044 tok/s +step 2249/19560 | loss 3.778157 (-0.59z)| norm 0.2648 (-1.40z)| lr 5.90e-04 | 8448.58 ms | -100.0% bf16 MFU | 62044 tok/s +step 2250/19560 | loss 3.789101 (-0.31z)| norm 0.3149 (+0.50z)| lr 5.90e-04 | 8449.72 ms | -100.0% bf16 MFU | 62044 tok/s +val loss 3.810804 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2654/10042 = 0.264290 +step 2251/19560 | loss 3.821416 (+0.47z)| norm 0.3510 (+1.84z)| lr 5.90e-04 | 8461.13 ms | -100.0% bf16 MFU | 62040 tok/s +step 2252/19560 | loss 3.812375 (+0.23z)| norm 0.3375 (+1.31z)| lr 5.90e-04 | 8460.17 ms | -100.0% bf16 MFU | 62037 tok/s +step 2253/19560 | loss 3.856207 (+1.31z)| norm 0.3222 (+0.73z)| lr 5.90e-04 | 8455.69 ms | -100.0% bf16 MFU | 62035 tok/s +step 2254/19560 | loss 3.754081 (-1.22z)| norm 0.2974 (-0.18z)| lr 5.90e-04 | 8446.79 ms | -100.0% bf16 MFU | 62037 tok/s +step 2255/19560 | loss 3.725630 (-1.92z)| norm 0.3217 (+0.73z)| lr 5.90e-04 | 8447.17 ms | -100.0% bf16 MFU | 62038 tok/s +step 2256/19560 | loss 3.746645 (-1.37z)| norm 0.2988 (-0.13z)| lr 5.90e-04 | 8448.74 ms | -100.0% bf16 MFU | 62039 tok/s +step 2257/19560 | loss 3.750337 (-1.28z)| norm 0.2984 (-0.15z)| lr 5.90e-04 | 8445.71 ms | -100.0% bf16 MFU | 62041 tok/s +step 2258/19560 | loss 3.836183 (+0.84z)| norm 0.2845 (-0.67z)| lr 5.90e-04 | 8448.56 ms | -100.0% bf16 MFU | 62042 tok/s +step 2259/19560 | loss 3.895366 (+2.25z)| norm 0.3000 (-0.10z)| lr 5.90e-04 | 8443.73 ms | -100.0% bf16 MFU | 62044 tok/s +step 2260/19560 | loss 3.782324 (-0.50z)| norm 0.2798 (-0.87z)| lr 5.90e-04 | 8444.20 ms | -100.0% bf16 MFU | 62047 tok/s +step 2261/19560 | loss 3.720602 (-1.96z)| norm 0.3092 (+0.23z)| lr 5.90e-04 | 8441.44 ms | -100.0% bf16 MFU | 62050 tok/s +step 2262/19560 | loss 3.824524 (+0.54z)| norm 0.3662 (+2.33z)| lr 5.90e-04 | 8443.57 ms | -100.0% bf16 MFU | 62052 tok/s +step 2263/19560 | loss 3.803560 (+0.04z)| norm 0.3738 (+2.55z)| lr 5.90e-04 | 8446.89 ms | -100.0% bf16 MFU | 62053 tok/s +step 2264/19560 | loss 3.850568 (+1.16z)| norm 0.3591 (+2.02z)| lr 5.90e-04 | 8446.50 ms | -100.0% bf16 MFU | 62054 tok/s +step 2265/19560 | loss 3.801760 (-0.00z)| norm 0.3089 (+0.19z)| lr 5.90e-04 | 8443.14 ms | -100.0% bf16 MFU | 62056 tok/s +step 2266/19560 | loss 3.776718 (-0.60z)| norm 0.3068 (+0.11z)| lr 5.90e-04 | 8449.64 ms | -100.0% bf16 MFU | 62056 tok/s +step 2267/19560 | loss 3.785403 (-0.38z)| norm 0.3084 (+0.16z)| lr 5.90e-04 | 8441.73 ms | -100.0% bf16 MFU | 62058 tok/s +step 2268/19560 | loss 3.748078 (-1.27z)| norm 0.3062 (+0.09z)| lr 5.90e-04 | 8443.82 ms | -100.0% bf16 MFU | 62060 tok/s +step 2269/19560 | loss 3.831069 (+0.74z)| norm 0.3441 (+1.47z)| lr 5.90e-04 | 8447.91 ms | -100.0% bf16 MFU | 62060 tok/s +step 2270/19560 | loss 3.841484 (+0.98z)| norm 0.2834 (-0.75z)| lr 5.90e-04 | 8449.35 ms | -100.0% bf16 MFU | 62059 tok/s +step 2271/19560 | loss 3.777513 (-0.56z)| norm 0.3062 (+0.09z)| lr 5.90e-04 | 8445.86 ms | -100.0% bf16 MFU | 62060 tok/s +step 2272/19560 | loss 3.772870 (-0.67z)| norm 0.2980 (-0.22z)| lr 5.90e-04 | 8441.94 ms | -100.0% bf16 MFU | 62062 tok/s +step 2273/19560 | loss 3.779097 (-0.52z)| norm 0.2850 (-0.69z)| lr 5.90e-04 | 8446.63 ms | -100.0% bf16 MFU | 62063 tok/s +step 2274/19560 | loss 3.784949 (-0.40z)| norm 0.2756 (-1.04z)| lr 5.90e-04 | 8448.41 ms | -100.0% bf16 MFU | 62063 tok/s +step 2275/19560 | loss 3.827475 (+0.64z)| norm 0.3096 (+0.20z)| lr 5.90e-04 | 8448.97 ms | -100.0% bf16 MFU | 62062 tok/s +step 2276/19560 | loss 3.800877 (-0.01z)| norm 0.3265 (+0.81z)| lr 5.90e-04 | 8449.92 ms | -100.0% bf16 MFU | 62061 tok/s +step 2277/19560 | loss 3.701491 (-2.38z)| norm 0.3031 (-0.07z)| lr 5.90e-04 | 8443.16 ms | -100.0% bf16 MFU | 62063 tok/s +step 2278/19560 | loss 3.804169 (+0.07z)| norm 0.2510 (-1.96z)| lr 5.90e-04 | 8443.12 ms | -100.0% bf16 MFU | 62065 tok/s +step 2279/19560 | loss 3.826628 (+0.61z)| norm 0.2869 (-0.63z)| lr 5.90e-04 | 8458.32 ms | -100.0% bf16 MFU | 62061 tok/s +step 2280/19560 | loss 3.777390 (-0.58z)| norm 0.3053 (+0.05z)| lr 5.90e-04 | 8452.21 ms | -100.0% bf16 MFU | 62059 tok/s +step 2281/19560 | loss 3.810444 (+0.22z)| norm 0.2964 (-0.27z)| lr 5.90e-04 | 8454.34 ms | -100.0% bf16 MFU | 62057 tok/s +step 2282/19560 | loss 3.800218 (-0.05z)| norm 0.2753 (-1.04z)| lr 5.90e-04 | 8456.49 ms | -100.0% bf16 MFU | 62054 tok/s +step 2283/19560 | loss 3.732236 (-1.70z)| norm 0.2934 (-0.38z)| lr 5.90e-04 | 8453.78 ms | -100.0% bf16 MFU | 62052 tok/s +step 2284/19560 | loss 3.770104 (-0.77z)| norm 0.2427 (-2.18z)| lr 5.90e-04 | 8455.84 ms | -100.0% bf16 MFU | 62050 tok/s +step 2285/19560 | loss 3.812252 (+0.27z)| norm 0.2753 (-1.00z)| lr 5.90e-04 | 8458.04 ms | -100.0% bf16 MFU | 62047 tok/s +step 2286/19560 | loss 3.751810 (-1.21z)| norm 0.3125 (+0.33z)| lr 5.90e-04 | 8452.24 ms | -100.0% bf16 MFU | 62046 tok/s +step 2287/19560 | loss 3.790507 (-0.23z)| norm 0.2993 (-0.15z)| lr 5.90e-04 | 8452.56 ms | -100.0% bf16 MFU | 62045 tok/s +step 2288/19560 | loss 3.850336 (+1.30z)| norm 0.2902 (-0.47z)| lr 5.90e-04 | 8453.19 ms | -100.0% bf16 MFU | 62044 tok/s +step 2289/19560 | loss 3.778292 (-0.53z)| norm 0.2896 (-0.49z)| lr 5.90e-04 | 8454.56 ms | -100.0% bf16 MFU | 62042 tok/s +step 2290/19560 | loss 3.778812 (-0.51z)| norm 0.2972 (-0.22z)| lr 5.90e-04 | 8456.95 ms | -100.0% bf16 MFU | 62040 tok/s +step 2291/19560 | loss 3.780647 (-0.45z)| norm 0.3138 (+0.37z)| lr 5.90e-04 | 8456.68 ms | -100.0% bf16 MFU | 62038 tok/s +step 2292/19560 | loss 3.744898 (-1.35z)| norm 0.3206 (+0.61z)| lr 5.90e-04 | 8447.42 ms | -100.0% bf16 MFU | 62039 tok/s +step 2293/19560 | loss 3.757979 (-1.01z)| norm 0.3449 (+1.46z)| lr 5.90e-04 | 8441.85 ms | -100.0% bf16 MFU | 62042 tok/s +step 2294/19560 | loss 3.807709 (+0.29z)| norm 0.3707 (+2.32z)| lr 5.90e-04 | 8437.85 ms | -100.0% bf16 MFU | 62047 tok/s +step 2295/19560 | loss 3.884447 (+2.23z)| norm 0.3125 (+0.26z)| lr 5.89e-04 | 8441.40 ms | -100.0% bf16 MFU | 62050 tok/s +step 2296/19560 | loss 3.767940 (-0.75z)| norm 0.2928 (-0.42z)| lr 5.89e-04 | 8437.55 ms | -100.0% bf16 MFU | 62054 tok/s +step 2297/19560 | loss 3.785337 (-0.29z)| norm 0.3099 (+0.19z)| lr 5.89e-04 | 8438.57 ms | -100.0% bf16 MFU | 62058 tok/s +step 2298/19560 | loss 3.795138 (-0.04z)| norm 0.3104 (+0.20z)| lr 5.89e-04 | 8433.34 ms | -100.0% bf16 MFU | 62064 tok/s +step 2299/19560 | loss 3.759249 (-0.98z)| norm 0.2926 (-0.43z)| lr 5.89e-04 | 8437.63 ms | -100.0% bf16 MFU | 62067 tok/s +step 2300/19560 | loss 3.750205 (-1.20z)| norm 0.3191 (+0.52z)| lr 5.89e-04 | 8442.28 ms | -100.0% bf16 MFU | 62069 tok/s +step 2301/19560 | loss 3.742448 (-1.38z)| norm 0.3121 (+0.27z)| lr 5.89e-04 | 8441.36 ms | -100.0% bf16 MFU | 62071 tok/s +step 2302/19560 | loss 3.815584 (+0.52z)| norm 0.2810 (-0.83z)| lr 5.89e-04 | 8441.01 ms | -100.0% bf16 MFU | 62073 tok/s +step 2303/19560 | loss 3.787551 (-0.22z)| norm 0.2795 (-0.87z)| lr 5.89e-04 | 8438.86 ms | -100.0% bf16 MFU | 62076 tok/s +step 2304/19560 | loss 3.763557 (-0.84z)| norm 0.2898 (-0.49z)| lr 5.89e-04 | 8439.08 ms | -100.0% bf16 MFU | 62078 tok/s +step 2305/19560 | loss 3.776376 (-0.49z)| norm 0.2940 (-0.32z)| lr 5.89e-04 | 8440.40 ms | -100.0% bf16 MFU | 62080 tok/s +step 2306/19560 | loss 3.850350 (+1.44z)| norm 0.2871 (-0.56z)| lr 5.89e-04 | 8440.90 ms | -100.0% bf16 MFU | 62082 tok/s +step 2307/19560 | loss 3.798681 (+0.09z)| norm 0.2848 (-0.64z)| lr 5.89e-04 | 8436.10 ms | -100.0% bf16 MFU | 62085 tok/s +step 2308/19560 | loss 3.782204 (-0.34z)| norm 0.2678 (-1.26z)| lr 5.89e-04 | 8442.37 ms | -100.0% bf16 MFU | 62086 tok/s +step 2309/19560 | loss 3.788586 (-0.17z)| norm 0.2823 (-0.71z)| lr 5.89e-04 | 8442.47 ms | -100.0% bf16 MFU | 62087 tok/s +step 2310/19560 | loss 3.795362 (+0.01z)| norm 0.2884 (-0.49z)| lr 5.89e-04 | 8441.68 ms | -100.0% bf16 MFU | 62088 tok/s +step 2311/19560 | loss 3.751318 (-1.14z)| norm 0.2681 (-1.25z)| lr 5.89e-04 | 8445.66 ms | -100.0% bf16 MFU | 62087 tok/s +step 2312/19560 | loss 3.858118 (+1.66z)| norm 0.2936 (-0.30z)| lr 5.89e-04 | 8443.31 ms | -100.0% bf16 MFU | 62088 tok/s +step 2313/19560 | loss 3.826895 (+0.84z)| norm 0.2898 (-0.45z)| lr 5.89e-04 | 8444.67 ms | -100.0% bf16 MFU | 62088 tok/s +step 2314/19560 | loss 3.752740 (-1.08z)| norm 0.2662 (-1.35z)| lr 5.89e-04 | 8439.80 ms | -100.0% bf16 MFU | 62089 tok/s +step 2315/19560 | loss 3.824304 (+0.77z)| norm 0.2572 (-1.66z)| lr 5.89e-04 | 8441.87 ms | -100.0% bf16 MFU | 62090 tok/s +step 2316/19560 | loss 3.814259 (+0.50z)| norm 0.2514 (-1.84z)| lr 5.89e-04 | 8444.92 ms | -100.0% bf16 MFU | 62090 tok/s +step 2317/19560 | loss 3.800428 (+0.13z)| norm 0.2611 (-1.46z)| lr 5.89e-04 | 8448.11 ms | -100.0% bf16 MFU | 62088 tok/s +step 2318/19560 | loss 3.785764 (-0.26z)| norm 0.2701 (-1.12z)| lr 5.89e-04 | 8448.77 ms | -100.0% bf16 MFU | 62087 tok/s +step 2319/19560 | loss 3.792853 (-0.06z)| norm 0.2634 (-1.37z)| lr 5.89e-04 | 8449.56 ms | -100.0% bf16 MFU | 62085 tok/s +step 2320/19560 | loss 3.757312 (-0.99z)| norm 0.2892 (-0.43z)| lr 5.89e-04 | 8451.14 ms | -100.0% bf16 MFU | 62082 tok/s +step 2321/19560 | loss 3.842635 (+1.28z)| norm 0.3531 (+1.90z)| lr 5.89e-04 | 8453.21 ms | -100.0% bf16 MFU | 62079 tok/s +step 2322/19560 | loss 3.785177 (-0.25z)| norm 0.3830 (+2.88z)| lr 5.89e-04 | 8452.84 ms | -100.0% bf16 MFU | 62077 tok/s +step 2323/19560 | loss 3.792372 (-0.06z)| norm 0.3300 (+0.99z)| lr 5.89e-04 | 8452.74 ms | -100.0% bf16 MFU | 62074 tok/s +step 2324/19560 | loss 3.769044 (-0.67z)| norm 0.2965 (-0.18z)| lr 5.89e-04 | 8448.10 ms | -100.0% bf16 MFU | 62073 tok/s +step 2325/19560 | loss 3.721360 (-2.02z)| norm 0.3055 (+0.16z)| lr 5.89e-04 | 8446.79 ms | -100.0% bf16 MFU | 62073 tok/s +step 2326/19560 | loss 3.766979 (-0.71z)| norm 0.3390 (+1.38z)| lr 5.89e-04 | 8452.72 ms | -100.0% bf16 MFU | 62071 tok/s +step 2327/19560 | loss 3.761572 (-0.85z)| norm 0.3286 (+1.03z)| lr 5.89e-04 | 8453.78 ms | -100.0% bf16 MFU | 62068 tok/s +step 2328/19560 | loss 3.786776 (-0.12z)| norm 0.3093 (+0.32z)| lr 5.89e-04 | 8449.35 ms | -100.0% bf16 MFU | 62067 tok/s +step 2329/19560 | loss 3.731779 (-1.69z)| norm 0.2723 (-1.03z)| lr 5.89e-04 | 8450.38 ms | -100.0% bf16 MFU | 62066 tok/s +step 2330/19560 | loss 3.831435 (+1.16z)| norm 0.2797 (-0.76z)| lr 5.89e-04 | 8449.68 ms | -100.0% bf16 MFU | 62065 tok/s +step 2331/19560 | loss 3.841210 (+1.42z)| norm 0.2619 (-1.40z)| lr 5.89e-04 | 8447.62 ms | -100.0% bf16 MFU | 62065 tok/s +step 2332/19560 | loss 3.783199 (-0.23z)| norm 0.2507 (-1.78z)| lr 5.89e-04 | 8449.74 ms | -100.0% bf16 MFU | 62064 tok/s +step 2333/19560 | loss 3.806501 (+0.43z)| norm 0.2883 (-0.43z)| lr 5.89e-04 | 8452.45 ms | -100.0% bf16 MFU | 62062 tok/s +step 2334/19560 | loss 3.774776 (-0.47z)| norm 0.3028 (+0.10z)| lr 5.89e-04 | 8449.82 ms | -100.0% bf16 MFU | 62062 tok/s +step 2335/19560 | loss 3.736590 (-1.53z)| norm 0.3104 (+0.37z)| lr 5.89e-04 | 8449.15 ms | -100.0% bf16 MFU | 62061 tok/s +step 2336/19560 | loss 3.779460 (-0.32z)| norm 0.3134 (+0.48z)| lr 5.89e-04 | 8451.19 ms | -100.0% bf16 MFU | 62060 tok/s +step 2337/19560 | loss 3.736805 (-1.50z)| norm 0.3009 (+0.04z)| lr 5.89e-04 | 8451.36 ms | -100.0% bf16 MFU | 62059 tok/s +step 2338/19560 | loss 3.759335 (-0.86z)| norm 0.3165 (+0.63z)| lr 5.89e-04 | 8450.46 ms | -100.0% bf16 MFU | 62058 tok/s +step 2339/19560 | loss 3.805927 (+0.46z)| norm 0.3182 (+0.69z)| lr 5.89e-04 | 8449.63 ms | -100.0% bf16 MFU | 62058 tok/s +step 2340/19560 | loss 3.669769 (-3.24z)| norm 0.3048 (+0.19z)| lr 5.89e-04 | 8452.84 ms | -100.0% bf16 MFU | 62056 tok/s +step 2341/19560 | loss 3.745174 (-1.18z)| norm 0.2957 (-0.15z)| lr 5.89e-04 | 8450.19 ms | -100.0% bf16 MFU | 62055 tok/s +step 2342/19560 | loss 3.739817 (-1.31z)| norm 0.3000 (+0.01z)| lr 5.89e-04 | 8448.42 ms | -100.0% bf16 MFU | 62055 tok/s +step 2343/19560 | loss 3.775745 (-0.34z)| norm 0.2875 (-0.46z)| lr 5.89e-04 | 8448.78 ms | -100.0% bf16 MFU | 62055 tok/s +step 2344/19560 | loss 3.892862 (+2.74z)| norm 0.2811 (-0.71z)| lr 5.89e-04 | 8446.93 ms | -100.0% bf16 MFU | 62056 tok/s +step 2345/19560 | loss 3.805941 (+0.45z)| norm 0.2629 (-1.37z)| lr 5.89e-04 | 8450.26 ms | -100.0% bf16 MFU | 62055 tok/s +step 2346/19560 | loss 3.719604 (-1.80z)| norm 0.2848 (-0.56z)| lr 5.89e-04 | 8449.26 ms | -100.0% bf16 MFU | 62055 tok/s +step 2347/19560 | loss 3.753850 (-0.89z)| norm 0.3097 (+0.37z)| lr 5.89e-04 | 8450.31 ms | -100.0% bf16 MFU | 62055 tok/s +step 2348/19560 | loss 3.781976 (-0.14z)| norm 0.2802 (-0.71z)| lr 5.89e-04 | 8448.92 ms | -100.0% bf16 MFU | 62055 tok/s +step 2349/19560 | loss 3.813973 (+0.69z)| norm 0.2729 (-0.97z)| lr 5.89e-04 | 8452.69 ms | -100.0% bf16 MFU | 62053 tok/s +step 2350/19560 | loss 3.754768 (-0.86z)| norm 0.2982 (-0.03z)| lr 5.89e-04 | 8453.42 ms | -100.0% bf16 MFU | 62052 tok/s +step 2351/19560 | loss 3.818808 (+0.82z)| norm 0.3075 (+0.33z)| lr 5.89e-04 | 8452.84 ms | -100.0% bf16 MFU | 62050 tok/s +step 2352/19560 | loss 3.779649 (-0.20z)| norm 0.3120 (+0.50z)| lr 5.89e-04 | 8450.90 ms | -100.0% bf16 MFU | 62050 tok/s +step 2353/19560 | loss 3.781153 (-0.17z)| norm 0.3108 (+0.45z)| lr 5.89e-04 | 8450.10 ms | -100.0% bf16 MFU | 62049 tok/s +step 2354/19560 | loss 3.796522 (+0.24z)| norm 0.3460 (+1.76z)| lr 5.89e-04 | 8451.84 ms | -100.0% bf16 MFU | 62049 tok/s +step 2355/19560 | loss 3.778396 (-0.23z)| norm 0.3136 (+0.55z)| lr 5.89e-04 | 8448.41 ms | -100.0% bf16 MFU | 62049 tok/s +step 2356/19560 | loss 3.755469 (-0.83z)| norm 0.2719 (-1.01z)| lr 5.89e-04 | 8451.26 ms | -100.0% bf16 MFU | 62048 tok/s +step 2357/19560 | loss 3.772190 (-0.39z)| norm 0.2696 (-1.09z)| lr 5.89e-04 | 8445.95 ms | -100.0% bf16 MFU | 62050 tok/s +step 2358/19560 | loss 3.754826 (-0.84z)| norm 0.2867 (-0.45z)| lr 5.89e-04 | 8442.32 ms | -100.0% bf16 MFU | 62052 tok/s +step 2359/19560 | loss 3.755466 (-0.82z)| norm 0.2707 (-1.03z)| lr 5.89e-04 | 8435.43 ms | -100.0% bf16 MFU | 62057 tok/s +step 2360/19560 | loss 3.781482 (-0.13z)| norm 0.2744 (-0.88z)| lr 5.89e-04 | 8435.42 ms | -100.0% bf16 MFU | 62062 tok/s +step 2361/19560 | loss 3.732214 (-1.41z)| norm 0.2853 (-0.49z)| lr 5.89e-04 | 8433.19 ms | -100.0% bf16 MFU | 62068 tok/s +step 2362/19560 | loss 3.744033 (-1.09z)| norm 0.2999 (+0.05z)| lr 5.89e-04 | 8435.97 ms | -100.0% bf16 MFU | 62072 tok/s +step 2363/19560 | loss 3.829031 (+1.12z)| norm 0.3006 (+0.07z)| lr 5.89e-04 | 8438.99 ms | -100.0% bf16 MFU | 62074 tok/s +step 2364/19560 | loss 3.755168 (-0.79z)| norm 0.3374 (+1.44z)| lr 5.89e-04 | 8438.10 ms | -100.0% bf16 MFU | 62077 tok/s +step 2365/19560 | loss 3.729365 (-1.44z)| norm 0.3294 (+1.12z)| lr 5.89e-04 | 8434.73 ms | -100.0% bf16 MFU | 62081 tok/s +step 2366/19560 | loss 3.767592 (-0.45z)| norm 0.3160 (+0.61z)| lr 5.89e-04 | 8434.01 ms | -100.0% bf16 MFU | 62086 tok/s +step 2367/19560 | loss 3.793035 (+0.21z)| norm 0.2977 (-0.08z)| lr 5.89e-04 | 8433.23 ms | -100.0% bf16 MFU | 62090 tok/s +step 2368/19560 | loss 3.748851 (-0.92z)| norm 0.2951 (-0.18z)| lr 5.89e-04 | 8435.08 ms | -100.0% bf16 MFU | 62093 tok/s +step 2369/19560 | loss 3.722931 (-1.56z)| norm 0.2877 (-0.46z)| lr 5.88e-04 | 8436.17 ms | -100.0% bf16 MFU | 62096 tok/s +step 2370/19560 | loss 3.782363 (-0.03z)| norm 0.2787 (-0.78z)| lr 5.88e-04 | 8434.45 ms | -100.0% bf16 MFU | 62099 tok/s +step 2371/19560 | loss 3.756351 (-0.70z)| norm 0.2599 (-1.47z)| lr 5.88e-04 | 8436.84 ms | -100.0% bf16 MFU | 62101 tok/s +step 2372/19560 | loss 3.740033 (-1.13z)| norm 0.2767 (-0.82z)| lr 5.88e-04 | 8436.08 ms | -100.0% bf16 MFU | 62104 tok/s +step 2373/19560 | loss 3.780202 (-0.09z)| norm 0.2547 (-1.64z)| lr 5.88e-04 | 8437.46 ms | -100.0% bf16 MFU | 62105 tok/s +step 2374/19560 | loss 3.755285 (-0.73z)| norm 0.2650 (-1.24z)| lr 5.88e-04 | 8438.46 ms | -100.0% bf16 MFU | 62107 tok/s +step 2375/19560 | loss 3.866548 (+2.12z)| norm 0.3212 (+0.85z)| lr 5.88e-04 | 8433.71 ms | -100.0% bf16 MFU | 62109 tok/s +step 2376/19560 | loss 3.731159 (-1.33z)| norm 0.3278 (+1.08z)| lr 5.88e-04 | 8436.79 ms | -100.0% bf16 MFU | 62111 tok/s +step 2377/19560 | loss 3.777148 (-0.15z)| norm 0.3236 (+0.91z)| lr 5.88e-04 | 8435.68 ms | -100.0% bf16 MFU | 62113 tok/s +step 2378/19560 | loss 3.728733 (-1.37z)| norm 0.2897 (-0.35z)| lr 5.88e-04 | 8436.39 ms | -100.0% bf16 MFU | 62115 tok/s +step 2379/19560 | loss 3.774322 (-0.20z)| norm 0.2824 (-0.62z)| lr 5.88e-04 | 8440.00 ms | -100.0% bf16 MFU | 62115 tok/s +step 2380/19560 | loss 3.790122 (+0.21z)| norm 0.2755 (-0.86z)| lr 5.88e-04 | 8435.90 ms | -100.0% bf16 MFU | 62117 tok/s +step 2381/19560 | loss 3.787933 (+0.17z)| norm 0.2683 (-1.12z)| lr 5.88e-04 | 8440.51 ms | -100.0% bf16 MFU | 62117 tok/s +step 2382/19560 | loss 3.763889 (-0.46z)| norm 0.2565 (-1.54z)| lr 5.88e-04 | 8437.48 ms | -100.0% bf16 MFU | 62118 tok/s +step 2383/19560 | loss 3.726802 (-1.42z)| norm 0.2652 (-1.19z)| lr 5.88e-04 | 8441.42 ms | -100.0% bf16 MFU | 62117 tok/s +step 2384/19560 | loss 3.706290 (-1.92z)| norm 0.2988 (+0.07z)| lr 5.88e-04 | 8446.02 ms | -100.0% bf16 MFU | 62115 tok/s +step 2385/19560 | loss 3.726853 (-1.38z)| norm 0.2997 (+0.10z)| lr 5.88e-04 | 8444.09 ms | -100.0% bf16 MFU | 62114 tok/s +step 2386/19560 | loss 3.781962 (+0.03z)| norm 0.3077 (+0.40z)| lr 5.88e-04 | 8444.58 ms | -100.0% bf16 MFU | 62113 tok/s +step 2387/19560 | loss 3.789184 (+0.25z)| norm 0.3211 (+0.89z)| lr 5.88e-04 | 8448.42 ms | -100.0% bf16 MFU | 62110 tok/s +step 2388/19560 | loss 3.753965 (-0.68z)| norm 0.3462 (+1.79z)| lr 5.88e-04 | 8443.31 ms | -100.0% bf16 MFU | 62109 tok/s +step 2389/19560 | loss 3.763106 (-0.45z)| norm 0.3699 (+2.59z)| lr 5.88e-04 | 8447.56 ms | -100.0% bf16 MFU | 62107 tok/s +step 2390/19560 | loss 3.819789 (+1.07z)| norm 0.3685 (+2.53z)| lr 5.88e-04 | 8444.47 ms | -100.0% bf16 MFU | 62106 tok/s +step 2391/19560 | loss 3.747909 (-0.85z)| norm 0.3357 (+1.39z)| lr 5.88e-04 | 8447.26 ms | -100.0% bf16 MFU | 62104 tok/s +step 2392/19560 | loss 3.758639 (-0.55z)| norm 0.3054 (+0.30z)| lr 5.88e-04 | 8447.62 ms | -100.0% bf16 MFU | 62102 tok/s +step 2393/19560 | loss 3.743021 (-0.96z)| norm 0.2826 (-0.56z)| lr 5.88e-04 | 8449.44 ms | -100.0% bf16 MFU | 62099 tok/s +step 2394/19560 | loss 3.770061 (-0.22z)| norm 0.2719 (-0.94z)| lr 5.88e-04 | 8455.88 ms | -100.0% bf16 MFU | 62094 tok/s +step 2395/19560 | loss 3.808007 (+0.80z)| norm 0.2908 (-0.23z)| lr 5.88e-04 | 8469.64 ms | -100.0% bf16 MFU | 62085 tok/s +step 2396/19560 | loss 3.827686 (+1.31z)| norm 0.2749 (-0.81z)| lr 5.88e-04 | 8474.28 ms | -100.0% bf16 MFU | 62074 tok/s +step 2397/19560 | loss 3.831079 (+1.40z)| norm 0.3428 (+1.73z)| lr 5.88e-04 | 8465.84 ms | -100.0% bf16 MFU | 62067 tok/s +step 2398/19560 | loss 3.804198 (+0.69z)| norm 0.3410 (+1.63z)| lr 5.88e-04 | 8468.21 ms | -100.0% bf16 MFU | 62059 tok/s +step 2399/19560 | loss 3.770144 (-0.23z)| norm 0.3020 (+0.18z)| lr 5.88e-04 | 8468.52 ms | -100.0% bf16 MFU | 62052 tok/s +step 2400/19560 | loss 3.766158 (-0.34z)| norm 0.3207 (+0.87z)| lr 5.88e-04 | 8471.91 ms | -100.0% bf16 MFU | 62043 tok/s +step 2401/19560 | loss 3.799610 (+0.56z)| norm 0.2728 (-0.90z)| lr 5.88e-04 | 8468.68 ms | -100.0% bf16 MFU | 62037 tok/s +step 2402/19560 | loss 3.799025 (+0.55z)| norm 0.3064 (+0.33z)| lr 5.88e-04 | 8472.27 ms | -100.0% bf16 MFU | 62029 tok/s +step 2403/19560 | loss 3.840102 (+1.65z)| norm 0.2831 (-0.52z)| lr 5.88e-04 | 8469.05 ms | -100.0% bf16 MFU | 62023 tok/s +step 2404/19560 | loss 3.841616 (+1.67z)| norm 0.2721 (-0.91z)| lr 5.88e-04 | 8472.00 ms | -100.0% bf16 MFU | 62016 tok/s +step 2405/19560 | loss 3.822347 (+1.14z)| norm 0.2759 (-0.76z)| lr 5.88e-04 | 8468.41 ms | -100.0% bf16 MFU | 62011 tok/s +step 2406/19560 | loss 3.710822 (-1.84z)| norm 0.2756 (-0.79z)| lr 5.88e-04 | 8470.13 ms | -100.0% bf16 MFU | 62005 tok/s +step 2407/19560 | loss 3.766487 (-0.34z)| norm 0.2645 (-1.19z)| lr 5.88e-04 | 8470.75 ms | -100.0% bf16 MFU | 61999 tok/s +step 2408/19560 | loss 3.739251 (-1.06z)| norm 0.2753 (-0.78z)| lr 5.88e-04 | 8473.32 ms | -100.0% bf16 MFU | 61993 tok/s +step 2409/19560 | loss 3.802330 (+0.63z)| norm 0.3002 (+0.14z)| lr 5.88e-04 | 8469.37 ms | -100.0% bf16 MFU | 61989 tok/s +step 2410/19560 | loss 3.750309 (-0.75z)| norm 0.2741 (-0.82z)| lr 5.88e-04 | 8468.45 ms | -100.0% bf16 MFU | 61985 tok/s +step 2411/19560 | loss 3.818856 (+1.07z)| norm 0.2701 (-0.96z)| lr 5.88e-04 | 8469.15 ms | -100.0% bf16 MFU | 61981 tok/s +step 2412/19560 | loss 3.777618 (-0.04z)| norm 0.2739 (-0.84z)| lr 5.88e-04 | 8463.22 ms | -100.0% bf16 MFU | 61979 tok/s +step 2413/19560 | loss 3.800253 (+0.57z)| norm 0.3172 (+0.76z)| lr 5.88e-04 | 8470.22 ms | -100.0% bf16 MFU | 61975 tok/s +step 2414/19560 | loss 3.771490 (-0.21z)| norm 0.3618 (+2.36z)| lr 5.88e-04 | 8471.41 ms | -100.0% bf16 MFU | 61971 tok/s +step 2415/19560 | loss 3.756966 (-0.59z)| norm 0.3016 (+0.16z)| lr 5.88e-04 | 8466.69 ms | -100.0% bf16 MFU | 61969 tok/s +step 2416/19560 | loss 3.730667 (-1.28z)| norm 0.2835 (-0.50z)| lr 5.88e-04 | 8464.96 ms | -100.0% bf16 MFU | 61967 tok/s +step 2417/19560 | loss 3.784659 (+0.18z)| norm 0.3152 (+0.65z)| lr 5.88e-04 | 8464.00 ms | -100.0% bf16 MFU | 61966 tok/s +step 2418/19560 | loss 3.773504 (-0.12z)| norm 0.2803 (-0.61z)| lr 5.88e-04 | 8462.94 ms | -100.0% bf16 MFU | 61965 tok/s +step 2419/19560 | loss 3.766787 (-0.30z)| norm 0.2937 (-0.12z)| lr 5.88e-04 | 8463.21 ms | -100.0% bf16 MFU | 61964 tok/s +step 2420/19560 | loss 3.793142 (+0.41z)| norm 0.2660 (-1.12z)| lr 5.88e-04 | 8464.87 ms | -100.0% bf16 MFU | 61963 tok/s +step 2421/19560 | loss 3.799330 (+0.57z)| norm 0.2666 (-1.08z)| lr 5.88e-04 | 8465.61 ms | -100.0% bf16 MFU | 61961 tok/s +step 2422/19560 | loss 3.749427 (-0.78z)| norm 0.2981 (+0.10z)| lr 5.88e-04 | 8464.49 ms | -100.0% bf16 MFU | 61960 tok/s +step 2423/19560 | loss 3.788450 (+0.31z)| norm 0.3169 (+0.81z)| lr 5.88e-04 | 8463.88 ms | -100.0% bf16 MFU | 61959 tok/s +step 2424/19560 | loss 3.790561 (+0.37z)| norm 0.2844 (-0.42z)| lr 5.88e-04 | 8463.10 ms | -100.0% bf16 MFU | 61959 tok/s +step 2425/19560 | loss 3.864456 (+2.37z)| norm 0.2878 (-0.28z)| lr 5.88e-04 | 8463.46 ms | -100.0% bf16 MFU | 61958 tok/s +step 2426/19560 | loss 3.798612 (+0.56z)| norm 0.3349 (+1.48z)| lr 5.88e-04 | 8461.46 ms | -100.0% bf16 MFU | 61959 tok/s +step 2427/19560 | loss 3.771328 (-0.19z)| norm 0.3491 (+1.97z)| lr 5.88e-04 | 8466.05 ms | -100.0% bf16 MFU | 61957 tok/s +step 2428/19560 | loss 3.800062 (+0.59z)| norm 0.3240 (+1.04z)| lr 5.88e-04 | 8462.29 ms | -100.0% bf16 MFU | 61957 tok/s +step 2429/19560 | loss 3.823347 (+1.21z)| norm 0.3529 (+2.06z)| lr 5.88e-04 | 8469.31 ms | -100.0% bf16 MFU | 61954 tok/s +step 2430/19560 | loss 3.748447 (-0.83z)| norm 0.3146 (+0.66z)| lr 5.88e-04 | 8462.22 ms | -100.0% bf16 MFU | 61954 tok/s +step 2431/19560 | loss 3.745749 (-0.90z)| norm 0.3150 (+0.66z)| lr 5.88e-04 | 8465.64 ms | -100.0% bf16 MFU | 61953 tok/s +step 2432/19560 | loss 3.738584 (-1.08z)| norm 0.3625 (+2.32z)| lr 5.88e-04 | 8467.42 ms | -100.0% bf16 MFU | 61952 tok/s +step 2433/19560 | loss 3.773838 (-0.12z)| norm 0.3417 (+1.55z)| lr 5.88e-04 | 8468.33 ms | -100.0% bf16 MFU | 61950 tok/s +step 2434/19560 | loss 3.765282 (-0.34z)| norm 0.3075 (+0.34z)| lr 5.88e-04 | 8464.86 ms | -100.0% bf16 MFU | 61949 tok/s +step 2435/19560 | loss 3.792070 (+0.41z)| norm 0.2976 (-0.01z)| lr 5.88e-04 | 8457.97 ms | -100.0% bf16 MFU | 61951 tok/s +step 2436/19560 | loss 3.810787 (+0.92z)| norm 0.3139 (+0.55z)| lr 5.88e-04 | 8458.68 ms | -100.0% bf16 MFU | 61952 tok/s +step 2437/19560 | loss 3.775973 (-0.05z)| norm 0.3438 (+1.58z)| lr 5.88e-04 | 8465.26 ms | -100.0% bf16 MFU | 61951 tok/s +step 2438/19560 | loss 3.767081 (-0.29z)| norm 0.3248 (+0.90z)| lr 5.88e-04 | 8461.77 ms | -100.0% bf16 MFU | 61952 tok/s +step 2439/19560 | loss 3.767675 (-0.27z)| norm 0.2840 (-0.53z)| lr 5.88e-04 | 8459.75 ms | -100.0% bf16 MFU | 61953 tok/s +step 2440/19560 | loss 3.724167 (-1.47z)| norm 0.2590 (-1.39z)| lr 5.88e-04 | 8468.47 ms | -100.0% bf16 MFU | 61951 tok/s +step 2441/19560 | loss 3.802540 (+0.74z)| norm 0.3026 (+0.12z)| lr 5.87e-04 | 8468.47 ms | -100.0% bf16 MFU | 61949 tok/s +step 2442/19560 | loss 3.781203 (+0.13z)| norm 0.3026 (+0.11z)| lr 5.87e-04 | 8459.84 ms | -100.0% bf16 MFU | 61950 tok/s +step 2443/19560 | loss 3.755573 (-0.58z)| norm 0.2991 (-0.02z)| lr 5.87e-04 | 8454.37 ms | -100.0% bf16 MFU | 61953 tok/s +step 2444/19560 | loss 3.798326 (+0.64z)| norm 0.3367 (+1.29z)| lr 5.87e-04 | 8462.65 ms | -100.0% bf16 MFU | 61953 tok/s +step 2445/19560 | loss 3.769795 (-0.17z)| norm 0.3182 (+0.62z)| lr 5.87e-04 | 8462.43 ms | -100.0% bf16 MFU | 61953 tok/s +step 2446/19560 | loss 3.834352 (+1.65z)| norm 0.3089 (+0.28z)| lr 5.87e-04 | 8461.66 ms | -100.0% bf16 MFU | 61954 tok/s +step 2447/19560 | loss 3.743963 (-0.89z)| norm 0.3219 (+0.73z)| lr 5.87e-04 | 8467.77 ms | -100.0% bf16 MFU | 61952 tok/s +step 2448/19560 | loss 3.823735 (+1.34z)| norm 0.3155 (+0.49z)| lr 5.87e-04 | 8467.71 ms | -100.0% bf16 MFU | 61950 tok/s +step 2449/19560 | loss 3.831981 (+1.58z)| norm 0.2988 (-0.09z)| lr 5.87e-04 | 8460.46 ms | -100.0% bf16 MFU | 61951 tok/s +step 2450/19560 | loss 3.789653 (+0.38z)| norm 0.2848 (-0.59z)| lr 5.87e-04 | 8464.16 ms | -100.0% bf16 MFU | 61951 tok/s +step 2451/19560 | loss 3.749450 (-0.74z)| norm 0.2898 (-0.40z)| lr 5.87e-04 | 8469.51 ms | -100.0% bf16 MFU | 61948 tok/s +step 2452/19560 | loss 3.784619 (+0.25z)| norm 0.3097 (+0.36z)| lr 5.87e-04 | 8461.67 ms | -100.0% bf16 MFU | 61949 tok/s +step 2453/19560 | loss 3.786283 (+0.28z)| norm 0.3246 (+0.91z)| lr 5.87e-04 | 8463.57 ms | -100.0% bf16 MFU | 61949 tok/s +step 2454/19560 | loss 3.748942 (-0.77z)| norm 0.3091 (+0.34z)| lr 5.87e-04 | 8456.86 ms | -100.0% bf16 MFU | 61951 tok/s +step 2455/19560 | loss 3.761074 (-0.43z)| norm 0.2816 (-0.70z)| lr 5.87e-04 | 8465.88 ms | -100.0% bf16 MFU | 61950 tok/s +step 2456/19560 | loss 3.798916 (+0.64z)| norm 0.3289 (+1.10z)| lr 5.87e-04 | 8465.48 ms | -100.0% bf16 MFU | 61949 tok/s +step 2457/19560 | loss 3.716228 (-1.69z)| norm 0.2927 (-0.29z)| lr 5.87e-04 | 8457.54 ms | -100.0% bf16 MFU | 61951 tok/s +step 2458/19560 | loss 3.725811 (-1.40z)| norm 0.2928 (-0.29z)| lr 5.87e-04 | 8463.41 ms | -100.0% bf16 MFU | 61951 tok/s +step 2459/19560 | loss 3.700860 (-2.06z)| norm 0.2769 (-0.91z)| lr 5.87e-04 | 8457.54 ms | -100.0% bf16 MFU | 61953 tok/s +step 2460/19560 | loss 3.798615 (+0.68z)| norm 0.2596 (-1.58z)| lr 5.87e-04 | 8459.40 ms | -100.0% bf16 MFU | 61954 tok/s +step 2461/19560 | loss 3.708943 (-1.80z)| norm 0.2709 (-1.13z)| lr 5.87e-04 | 8461.26 ms | -100.0% bf16 MFU | 61955 tok/s +step 2462/19560 | loss 3.753364 (-0.56z)| norm 0.2629 (-1.42z)| lr 5.87e-04 | 8458.85 ms | -100.0% bf16 MFU | 61956 tok/s +step 2463/19560 | loss 3.751633 (-0.61z)| norm 0.2713 (-1.08z)| lr 5.87e-04 | 8464.08 ms | -100.0% bf16 MFU | 61955 tok/s +step 2464/19560 | loss 3.804124 (+0.84z)| norm 0.2772 (-0.84z)| lr 5.87e-04 | 8461.00 ms | -100.0% bf16 MFU | 61956 tok/s +step 2465/19560 | loss 3.757656 (-0.45z)| norm 0.2807 (-0.71z)| lr 5.87e-04 | 8461.77 ms | -100.0% bf16 MFU | 61956 tok/s +step 2466/19560 | loss 3.757946 (-0.44z)| norm 0.2632 (-1.35z)| lr 5.87e-04 | 8457.85 ms | -100.0% bf16 MFU | 61958 tok/s +step 2467/19560 | loss 3.865312 (+2.48z)| norm 0.2957 (-0.11z)| lr 5.87e-04 | 8461.59 ms | -100.0% bf16 MFU | 61958 tok/s +step 2468/19560 | loss 3.801336 (+0.73z)| norm 0.2967 (-0.07z)| lr 5.87e-04 | 8466.21 ms | -100.0% bf16 MFU | 61956 tok/s +step 2469/19560 | loss 3.794877 (+0.54z)| norm 0.2925 (-0.23z)| lr 5.87e-04 | 8457.38 ms | -100.0% bf16 MFU | 61958 tok/s +step 2470/19560 | loss 3.742048 (-0.95z)| norm 0.2837 (-0.56z)| lr 5.87e-04 | 8459.33 ms | -100.0% bf16 MFU | 61959 tok/s +step 2471/19560 | loss 3.863855 (+2.40z)| norm 0.3051 (+0.24z)| lr 5.87e-04 | 8466.60 ms | -100.0% bf16 MFU | 61957 tok/s +step 2472/19560 | loss 3.791683 (+0.46z)| norm 0.3086 (+0.37z)| lr 5.87e-04 | 8462.76 ms | -100.0% bf16 MFU | 61957 tok/s +step 2473/19560 | loss 3.800518 (+0.71z)| norm 0.2996 (+0.02z)| lr 5.87e-04 | 8457.11 ms | -100.0% bf16 MFU | 61959 tok/s +step 2474/19560 | loss 3.759299 (-0.48z)| norm 0.2822 (-0.65z)| lr 5.87e-04 | 8460.00 ms | -100.0% bf16 MFU | 61960 tok/s +step 2475/19560 | loss 3.793644 (+0.50z)| norm 0.3120 (+0.49z)| lr 5.87e-04 | 8460.46 ms | -100.0% bf16 MFU | 61960 tok/s +step 2476/19560 | loss 3.778934 (+0.08z)| norm 0.3073 (+0.31z)| lr 5.87e-04 | 8455.47 ms | -100.0% bf16 MFU | 61962 tok/s +step 2477/19560 | loss 3.760253 (-0.45z)| norm 0.2936 (-0.23z)| lr 5.87e-04 | 8447.06 ms | -100.0% bf16 MFU | 61968 tok/s +step 2478/19560 | loss 3.727136 (-1.40z)| norm 0.3279 (+1.07z)| lr 5.87e-04 | 8445.31 ms | -100.0% bf16 MFU | 61973 tok/s +step 2479/19560 | loss 3.774825 (-0.01z)| norm 0.3414 (+1.56z)| lr 5.87e-04 | 8446.09 ms | -100.0% bf16 MFU | 61978 tok/s +step 2480/19560 | loss 3.802339 (+0.78z)| norm 0.3628 (+2.31z)| lr 5.87e-04 | 8442.67 ms | -100.0% bf16 MFU | 61984 tok/s +step 2481/19560 | loss 3.750227 (-0.72z)| norm 0.3379 (+1.37z)| lr 5.87e-04 | 8443.60 ms | -100.0% bf16 MFU | 61990 tok/s +step 2482/19560 | loss 3.790260 (+0.44z)| norm 0.3199 (+0.72z)| lr 5.87e-04 | 8443.51 ms | -100.0% bf16 MFU | 61995 tok/s +step 2483/19560 | loss 3.797903 (+0.65z)| norm 0.3156 (+0.56z)| lr 5.87e-04 | 8444.06 ms | -100.0% bf16 MFU | 62000 tok/s +step 2484/19560 | loss 3.814397 (+1.11z)| norm 0.2974 (-0.12z)| lr 5.87e-04 | 8444.06 ms | -100.0% bf16 MFU | 62004 tok/s +step 2485/19560 | loss 3.760306 (-0.44z)| norm 0.2669 (-1.25z)| lr 5.87e-04 | 8443.04 ms | -100.0% bf16 MFU | 62009 tok/s +step 2486/19560 | loss 3.740804 (-1.00z)| norm 0.3207 (+0.74z)| lr 5.87e-04 | 8445.79 ms | -100.0% bf16 MFU | 62012 tok/s +step 2487/19560 | loss 3.722594 (-1.50z)| norm 0.3035 (+0.09z)| lr 5.87e-04 | 8437.05 ms | -100.0% bf16 MFU | 62019 tok/s +step 2488/19560 | loss 3.805819 (+0.86z)| norm 0.2795 (-0.81z)| lr 5.87e-04 | 8444.91 ms | -100.0% bf16 MFU | 62022 tok/s +step 2489/19560 | loss 3.747832 (-0.79z)| norm 0.2716 (-1.10z)| lr 5.87e-04 | 8445.75 ms | -100.0% bf16 MFU | 62025 tok/s +step 2490/19560 | loss 3.802482 (+0.75z)| norm 0.2745 (-0.98z)| lr 5.87e-04 | 8439.69 ms | -100.0% bf16 MFU | 62030 tok/s +step 2491/19560 | loss 3.749717 (-0.74z)| norm 0.2528 (-1.75z)| lr 5.87e-04 | 8439.99 ms | -100.0% bf16 MFU | 62034 tok/s +step 2492/19560 | loss 3.751388 (-0.69z)| norm 0.2502 (-1.81z)| lr 5.87e-04 | 8434.75 ms | -100.0% bf16 MFU | 62040 tok/s +step 2493/19560 | loss 3.811606 (+1.02z)| norm 0.3327 (+1.20z)| lr 5.87e-04 | 8439.04 ms | -100.0% bf16 MFU | 62045 tok/s +step 2494/19560 | loss 3.785683 (+0.27z)| norm 0.3615 (+2.19z)| lr 5.87e-04 | 8438.78 ms | -100.0% bf16 MFU | 62049 tok/s +step 2495/19560 | loss 3.796506 (+0.58z)| norm 0.2750 (-0.89z)| lr 5.87e-04 | 8436.08 ms | -100.0% bf16 MFU | 62054 tok/s +step 2496/19560 | loss 3.844229 (+1.92z)| norm 0.2990 (-0.04z)| lr 5.87e-04 | 8439.64 ms | -100.0% bf16 MFU | 62057 tok/s +step 2497/19560 | loss 3.741319 (-1.03z)| norm 0.2847 (-0.54z)| lr 5.87e-04 | 8437.43 ms | -100.0% bf16 MFU | 62061 tok/s +step 2498/19560 | loss 3.714758 (-1.75z)| norm 0.2586 (-1.46z)| lr 5.87e-04 | 8437.92 ms | -100.0% bf16 MFU | 62065 tok/s +step 2499/19560 | loss 3.832265 (+1.54z)| norm 0.2731 (-0.95z)| lr 5.87e-04 | 8437.65 ms | -100.0% bf16 MFU | 62068 tok/s +step 2500/19560 | loss 3.836493 (+1.63z)| norm 0.2910 (-0.32z)| lr 5.87e-04 | 8438.77 ms | -100.0% bf16 MFU | 62071 tok/s +val loss 3.762915 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2638/10042 = 0.262697 +step 2501/19560 | loss 3.790483 (+0.35z)| norm 0.3204 (+0.71z)| lr 5.87e-04 | 8448.60 ms | -100.0% bf16 MFU | 62071 tok/s +step 2502/19560 | loss 3.774811 (-0.09z)| norm 0.3045 (+0.13z)| lr 5.87e-04 | 8452.85 ms | -100.0% bf16 MFU | 62068 tok/s +step 2503/19560 | loss 3.753958 (-0.67z)| norm 0.3115 (+0.39z)| lr 5.87e-04 | 8455.60 ms | -100.0% bf16 MFU | 62065 tok/s +step 2504/19560 | loss 3.748850 (-0.82z)| norm 0.2962 (-0.16z)| lr 5.87e-04 | 8450.78 ms | -100.0% bf16 MFU | 62064 tok/s +step 2505/19560 | loss 3.820168 (+1.21z)| norm 0.2709 (-1.06z)| lr 5.87e-04 | 8453.24 ms | -100.0% bf16 MFU | 62062 tok/s +step 2506/19560 | loss 3.755987 (-0.63z)| norm 0.2768 (-0.84z)| lr 5.87e-04 | 8454.90 ms | -100.0% bf16 MFU | 62059 tok/s +step 2507/19560 | loss 3.830032 (+1.47z)| norm 0.2572 (-1.53z)| lr 5.87e-04 | 8450.80 ms | -100.0% bf16 MFU | 62058 tok/s +step 2508/19560 | loss 3.836730 (+1.63z)| norm 0.2826 (-0.62z)| lr 5.87e-04 | 8451.63 ms | -100.0% bf16 MFU | 62057 tok/s +step 2509/19560 | loss 3.724836 (-1.49z)| norm 0.2801 (-0.72z)| lr 5.86e-04 | 8454.92 ms | -100.0% bf16 MFU | 62055 tok/s +step 2510/19560 | loss 3.746241 (-0.89z)| norm 0.2978 (-0.09z)| lr 5.86e-04 | 8456.24 ms | -100.0% bf16 MFU | 62052 tok/s +step 2511/19560 | loss 3.735998 (-1.18z)| norm 0.3122 (+0.42z)| lr 5.86e-04 | 8458.07 ms | -100.0% bf16 MFU | 62049 tok/s +step 2512/19560 | loss 3.750305 (-0.80z)| norm 0.2895 (-0.41z)| lr 5.86e-04 | 8456.63 ms | -100.0% bf16 MFU | 62046 tok/s +step 2513/19560 | loss 3.720493 (-1.64z)| norm 0.2592 (-1.49z)| lr 5.86e-04 | 8460.64 ms | -100.0% bf16 MFU | 62042 tok/s +step 2514/19560 | loss 3.724745 (-1.49z)| norm 0.2727 (-0.99z)| lr 5.86e-04 | 8454.87 ms | -100.0% bf16 MFU | 62041 tok/s +step 2515/19560 | loss 3.769430 (-0.24z)| norm 0.2713 (-1.02z)| lr 5.86e-04 | 8457.77 ms | -100.0% bf16 MFU | 62038 tok/s +step 2516/19560 | loss 3.753123 (-0.69z)| norm 0.2654 (-1.22z)| lr 5.86e-04 | 8458.99 ms | -100.0% bf16 MFU | 62035 tok/s +step 2517/19560 | loss 3.803161 (+0.70z)| norm 0.3032 (+0.17z)| lr 5.86e-04 | 8456.63 ms | -100.0% bf16 MFU | 62033 tok/s +step 2518/19560 | loss 3.756524 (-0.60z)| norm 0.3038 (+0.22z)| lr 5.86e-04 | 8457.16 ms | -100.0% bf16 MFU | 62031 tok/s +step 2519/19560 | loss 3.785876 (+0.22z)| norm 0.3040 (+0.24z)| lr 5.86e-04 | 8449.61 ms | -100.0% bf16 MFU | 62032 tok/s +step 2520/19560 | loss 3.728130 (-1.39z)| norm 0.3175 (+0.76z)| lr 5.86e-04 | 8442.44 ms | -100.0% bf16 MFU | 62036 tok/s +step 2521/19560 | loss 3.744352 (-0.94z)| norm 0.3216 (+0.90z)| lr 5.86e-04 | 8442.96 ms | -100.0% bf16 MFU | 62039 tok/s +step 2522/19560 | loss 3.706480 (-1.95z)| norm 0.3084 (+0.39z)| lr 5.86e-04 | 8438.10 ms | -100.0% bf16 MFU | 62043 tok/s +step 2523/19560 | loss 3.733642 (-1.18z)| norm 0.3073 (+0.34z)| lr 5.86e-04 | 8437.40 ms | -100.0% bf16 MFU | 62048 tok/s +step 2524/19560 | loss 3.735632 (-1.11z)| norm 0.3022 (+0.13z)| lr 5.86e-04 | 8439.13 ms | -100.0% bf16 MFU | 62052 tok/s +step 2525/19560 | loss 3.775816 (+0.01z)| norm 0.2643 (-1.32z)| lr 5.86e-04 | 8440.13 ms | -100.0% bf16 MFU | 62055 tok/s +step 2526/19560 | loss 3.691509 (-2.27z)| norm 0.2988 (+0.04z)| lr 5.86e-04 | 8436.06 ms | -100.0% bf16 MFU | 62060 tok/s +step 2527/19560 | loss 3.805158 (+0.82z)| norm 0.2909 (-0.27z)| lr 5.86e-04 | 8437.95 ms | -100.0% bf16 MFU | 62064 tok/s +step 2528/19560 | loss 3.737406 (-1.01z)| norm 0.2922 (-0.21z)| lr 5.86e-04 | 8437.15 ms | -100.0% bf16 MFU | 62068 tok/s +step 2529/19560 | loss 3.700486 (-1.97z)| norm 0.3017 (+0.16z)| lr 5.86e-04 | 8434.77 ms | -100.0% bf16 MFU | 62072 tok/s +step 2530/19560 | loss 3.764716 (-0.24z)| norm 0.3224 (+0.97z)| lr 5.86e-04 | 8440.53 ms | -100.0% bf16 MFU | 62074 tok/s +step 2531/19560 | loss 3.777176 (+0.11z)| norm 0.3166 (+0.73z)| lr 5.86e-04 | 8436.09 ms | -100.0% bf16 MFU | 62078 tok/s +step 2532/19560 | loss 3.771738 (-0.03z)| norm 0.2846 (-0.54z)| lr 5.86e-04 | 8433.67 ms | -100.0% bf16 MFU | 62082 tok/s +step 2533/19560 | loss 3.826619 (+1.48z)| norm 0.2832 (-0.60z)| lr 5.86e-04 | 8437.46 ms | -100.0% bf16 MFU | 62085 tok/s +step 2534/19560 | loss 3.758726 (-0.40z)| norm 0.2579 (-1.59z)| lr 5.86e-04 | 8436.50 ms | -100.0% bf16 MFU | 62088 tok/s +step 2535/19560 | loss 3.783853 (+0.30z)| norm 0.2724 (-1.02z)| lr 5.86e-04 | 8435.03 ms | -100.0% bf16 MFU | 62092 tok/s +step 2536/19560 | loss 3.826336 (+1.45z)| norm 0.2453 (-2.06z)| lr 5.86e-04 | 8437.03 ms | -100.0% bf16 MFU | 62094 tok/s +step 2537/19560 | loss 3.792816 (+0.53z)| norm 0.2948 (-0.12z)| lr 5.86e-04 | 8438.98 ms | -100.0% bf16 MFU | 62096 tok/s +step 2538/19560 | loss 3.754299 (-0.54z)| norm 0.3489 (+1.94z)| lr 5.86e-04 | 8439.77 ms | -100.0% bf16 MFU | 62097 tok/s +step 2539/19560 | loss 3.722218 (-1.41z)| norm 0.3582 (+2.24z)| lr 5.86e-04 | 8441.70 ms | -100.0% bf16 MFU | 62097 tok/s +step 2540/19560 | loss 3.797523 (+0.67z)| norm 0.3101 (+0.40z)| lr 5.86e-04 | 8437.17 ms | -100.0% bf16 MFU | 62100 tok/s +step 2541/19560 | loss 3.729980 (-1.18z)| norm 0.2660 (-1.25z)| lr 5.86e-04 | 8436.02 ms | -100.0% bf16 MFU | 62102 tok/s +step 2542/19560 | loss 3.710217 (-1.69z)| norm 0.3044 (+0.22z)| lr 5.86e-04 | 8440.42 ms | -100.0% bf16 MFU | 62103 tok/s +step 2543/19560 | loss 3.763592 (-0.24z)| norm 0.2840 (-0.56z)| lr 5.86e-04 | 8442.84 ms | -100.0% bf16 MFU | 62103 tok/s +step 2544/19560 | loss 3.749166 (-0.64z)| norm 0.2803 (-0.70z)| lr 5.86e-04 | 8442.32 ms | -100.0% bf16 MFU | 62103 tok/s +step 2545/19560 | loss 3.778570 (+0.17z)| norm 0.3054 (+0.27z)| lr 5.86e-04 | 8448.96 ms | -100.0% bf16 MFU | 62100 tok/s +step 2546/19560 | loss 3.733567 (-1.05z)| norm 0.3086 (+0.39z)| lr 5.86e-04 | 8447.15 ms | -100.0% bf16 MFU | 62098 tok/s +step 2547/19560 | loss 3.816187 (+1.18z)| norm 0.3072 (+0.33z)| lr 5.86e-04 | 8445.29 ms | -100.0% bf16 MFU | 62098 tok/s +step 2548/19560 | loss 3.793362 (+0.57z)| norm 0.3089 (+0.38z)| lr 5.86e-04 | 8451.17 ms | -100.0% bf16 MFU | 62095 tok/s +step 2549/19560 | loss 3.781843 (+0.26z)| norm 0.2875 (-0.46z)| lr 5.86e-04 | 8446.12 ms | -100.0% bf16 MFU | 62094 tok/s +step 2550/19560 | loss 3.736199 (-0.97z)| norm 0.2783 (-0.81z)| lr 5.86e-04 | 8449.64 ms | -100.0% bf16 MFU | 62091 tok/s +step 2551/19560 | loss 3.777400 (+0.14z)| norm 0.3128 (+0.54z)| lr 5.86e-04 | 8446.75 ms | -100.0% bf16 MFU | 62090 tok/s +step 2552/19560 | loss 3.768893 (-0.08z)| norm 0.2996 (+0.02z)| lr 5.86e-04 | 8446.69 ms | -100.0% bf16 MFU | 62089 tok/s +step 2553/19560 | loss 3.812826 (+1.14z)| norm 0.2895 (-0.38z)| lr 5.86e-04 | 8448.71 ms | -100.0% bf16 MFU | 62088 tok/s +step 2554/19560 | loss 3.711220 (-1.64z)| norm 0.3011 (+0.09z)| lr 5.86e-04 | 8447.85 ms | -100.0% bf16 MFU | 62086 tok/s +step 2555/19560 | loss 3.834153 (+1.70z)| norm 0.2866 (-0.47z)| lr 5.86e-04 | 8450.83 ms | -100.0% bf16 MFU | 62084 tok/s +step 2556/19560 | loss 3.790871 (+0.53z)| norm 0.2953 (-0.12z)| lr 5.86e-04 | 8446.57 ms | -100.0% bf16 MFU | 62083 tok/s +step 2557/19560 | loss 3.784246 (+0.36z)| norm 0.2842 (-0.55z)| lr 5.86e-04 | 8450.56 ms | -100.0% bf16 MFU | 62081 tok/s +step 2558/19560 | loss 3.834711 (+1.71z)| norm 0.2769 (-0.84z)| lr 5.86e-04 | 8445.65 ms | -100.0% bf16 MFU | 62081 tok/s +step 2559/19560 | loss 3.722921 (-1.31z)| norm 0.2719 (-1.03z)| lr 5.86e-04 | 8450.93 ms | -100.0% bf16 MFU | 62079 tok/s +step 2560/19560 | loss 3.736588 (-0.94z)| norm 0.2613 (-1.46z)| lr 5.86e-04 | 8453.26 ms | -100.0% bf16 MFU | 62076 tok/s +step 2561/19560 | loss 3.731485 (-1.06z)| norm 0.2651 (-1.29z)| lr 5.86e-04 | 8449.01 ms | -100.0% bf16 MFU | 62075 tok/s +step 2562/19560 | loss 3.788509 (+0.46z)| norm 0.2709 (-1.03z)| lr 5.86e-04 | 8448.32 ms | -100.0% bf16 MFU | 62074 tok/s +step 2563/19560 | loss 3.762414 (-0.23z)| norm 0.2850 (-0.43z)| lr 5.86e-04 | 8452.24 ms | -100.0% bf16 MFU | 62072 tok/s +step 2564/19560 | loss 3.750948 (-0.53z)| norm 0.2789 (-0.67z)| lr 5.86e-04 | 8454.95 ms | -100.0% bf16 MFU | 62069 tok/s +step 2565/19560 | loss 3.688337 (-2.15z)| norm 0.2565 (-1.60z)| lr 5.86e-04 | 8453.11 ms | -100.0% bf16 MFU | 62066 tok/s +step 2566/19560 | loss 3.749054 (-0.55z)| norm 0.2705 (-0.99z)| lr 5.86e-04 | 8453.54 ms | -100.0% bf16 MFU | 62064 tok/s +step 2567/19560 | loss 3.736702 (-0.86z)| norm 0.2654 (-1.20z)| lr 5.86e-04 | 8455.30 ms | -100.0% bf16 MFU | 62061 tok/s +step 2568/19560 | loss 3.743347 (-0.70z)| norm 0.2699 (-1.01z)| lr 5.86e-04 | 8453.50 ms | -100.0% bf16 MFU | 62059 tok/s +step 2569/19560 | loss 3.746549 (-0.60z)| norm 0.2793 (-0.60z)| lr 5.86e-04 | 8453.24 ms | -100.0% bf16 MFU | 62057 tok/s +step 2570/19560 | loss 3.691789 (-2.00z)| norm 0.2756 (-0.75z)| lr 5.86e-04 | 8454.52 ms | -100.0% bf16 MFU | 62055 tok/s +step 2571/19560 | loss 3.762752 (-0.15z)| norm 0.2877 (-0.24z)| lr 5.86e-04 | 8454.55 ms | -100.0% bf16 MFU | 62053 tok/s +step 2572/19560 | loss 3.812431 (+1.13z)| norm 0.2958 (+0.12z)| lr 5.86e-04 | 8452.90 ms | -100.0% bf16 MFU | 62052 tok/s +step 2573/19560 | loss 3.764900 (-0.10z)| norm 0.3157 (+0.97z)| lr 5.86e-04 | 8458.29 ms | -100.0% bf16 MFU | 62048 tok/s +step 2574/19560 | loss 3.716699 (-1.33z)| norm 0.3317 (+1.64z)| lr 5.86e-04 | 8453.79 ms | -100.0% bf16 MFU | 62047 tok/s +step 2575/19560 | loss 3.719137 (-1.26z)| norm 0.3221 (+1.23z)| lr 5.86e-04 | 8454.96 ms | -100.0% bf16 MFU | 62045 tok/s +step 2576/19560 | loss 3.751467 (-0.41z)| norm 0.3084 (+0.65z)| lr 5.85e-04 | 8457.36 ms | -100.0% bf16 MFU | 62042 tok/s +step 2577/19560 | loss 3.745955 (-0.54z)| norm 0.2816 (-0.49z)| lr 5.85e-04 | 8454.58 ms | -100.0% bf16 MFU | 62041 tok/s +step 2578/19560 | loss 3.759765 (-0.17z)| norm 0.3087 (+0.66z)| lr 5.85e-04 | 8455.46 ms | -100.0% bf16 MFU | 62039 tok/s +step 2579/19560 | loss 3.711128 (-1.44z)| norm 0.3477 (+2.25z)| lr 5.85e-04 | 8453.39 ms | -100.0% bf16 MFU | 62038 tok/s +step 2580/19560 | loss 3.710327 (-1.43z)| norm 0.3371 (+1.78z)| lr 5.85e-04 | 8455.13 ms | -100.0% bf16 MFU | 62037 tok/s +step 2581/19560 | loss 3.809265 (+1.14z)| norm 0.3019 (+0.34z)| lr 5.85e-04 | 8455.89 ms | -100.0% bf16 MFU | 62035 tok/s +step 2582/19560 | loss 3.756870 (-0.22z)| norm 0.3060 (+0.51z)| lr 5.85e-04 | 8457.16 ms | -100.0% bf16 MFU | 62033 tok/s +step 2583/19560 | loss 3.754546 (-0.28z)| norm 0.3051 (+0.47z)| lr 5.85e-04 | 8457.04 ms | -100.0% bf16 MFU | 62031 tok/s +step 2584/19560 | loss 3.787722 (+0.58z)| norm 0.2806 (-0.53z)| lr 5.85e-04 | 8456.76 ms | -100.0% bf16 MFU | 62029 tok/s +step 2585/19560 | loss 3.790013 (+0.63z)| norm 0.2643 (-1.20z)| lr 5.85e-04 | 8471.14 ms | -100.0% bf16 MFU | 62022 tok/s +step 2586/19560 | loss 3.791352 (+0.65z)| norm 0.2594 (-1.38z)| lr 5.85e-04 | 8482.21 ms | -100.0% bf16 MFU | 62012 tok/s +step 2587/19560 | loss 3.766558 (-0.01z)| norm 0.2721 (-0.86z)| lr 5.85e-04 | 8484.20 ms | -100.0% bf16 MFU | 62001 tok/s +step 2588/19560 | loss 3.760367 (-0.17z)| norm 0.2942 (+0.04z)| lr 5.85e-04 | 8476.36 ms | -100.0% bf16 MFU | 61993 tok/s +step 2589/19560 | loss 3.743785 (-0.62z)| norm 0.2582 (-1.44z)| lr 5.85e-04 | 8479.17 ms | -100.0% bf16 MFU | 61985 tok/s +step 2590/19560 | loss 3.674889 (-2.40z)| norm 0.2575 (-1.46z)| lr 5.85e-04 | 8479.61 ms | -100.0% bf16 MFU | 61978 tok/s +step 2591/19560 | loss 3.789272 (+0.59z)| norm 0.2582 (-1.42z)| lr 5.85e-04 | 8482.75 ms | -100.0% bf16 MFU | 61969 tok/s +step 2592/19560 | loss 3.736927 (-0.76z)| norm 0.2795 (-0.55z)| lr 5.85e-04 | 8479.65 ms | -100.0% bf16 MFU | 61962 tok/s +step 2593/19560 | loss 3.746497 (-0.51z)| norm 0.2883 (-0.19z)| lr 5.85e-04 | 8482.62 ms | -100.0% bf16 MFU | 61954 tok/s +step 2594/19560 | loss 3.834467 (+1.76z)| norm 0.2878 (-0.22z)| lr 5.85e-04 | 8481.03 ms | -100.0% bf16 MFU | 61948 tok/s +step 2595/19560 | loss 3.761914 (-0.10z)| norm 0.3059 (+0.52z)| lr 5.85e-04 | 8483.95 ms | -100.0% bf16 MFU | 61940 tok/s +step 2596/19560 | loss 3.773823 (+0.22z)| norm 0.3649 (+2.83z)| lr 5.85e-04 | 8477.66 ms | -100.0% bf16 MFU | 61935 tok/s +step 2597/19560 | loss 3.763660 (-0.04z)| norm 0.3538 (+2.32z)| lr 5.85e-04 | 8474.16 ms | -100.0% bf16 MFU | 61932 tok/s +step 2598/19560 | loss 3.779341 (+0.37z)| norm 0.2822 (-0.47z)| lr 5.85e-04 | 8480.15 ms | -100.0% bf16 MFU | 61927 tok/s +step 2599/19560 | loss 3.737119 (-0.76z)| norm 0.3375 (+1.66z)| lr 5.85e-04 | 8476.97 ms | -100.0% bf16 MFU | 61923 tok/s +step 2600/19560 | loss 3.771661 (+0.20z)| norm 0.3346 (+1.53z)| lr 5.85e-04 | 8476.71 ms | -100.0% bf16 MFU | 61919 tok/s +step 2601/19560 | loss 3.831224 (+1.81z)| norm 0.2975 (+0.11z)| lr 5.85e-04 | 8479.24 ms | -100.0% bf16 MFU | 61915 tok/s +step 2602/19560 | loss 3.794136 (+0.79z)| norm 0.2973 (+0.09z)| lr 5.85e-04 | 8480.38 ms | -100.0% bf16 MFU | 61910 tok/s +step 2603/19560 | loss 3.752426 (-0.33z)| norm 0.2896 (-0.20z)| lr 5.85e-04 | 8474.35 ms | -100.0% bf16 MFU | 61908 tok/s +step 2604/19560 | loss 3.751992 (-0.34z)| norm 0.2659 (-1.09z)| lr 5.85e-04 | 8476.09 ms | -100.0% bf16 MFU | 61905 tok/s +step 2605/19560 | loss 3.764958 (+0.01z)| norm 0.2808 (-0.51z)| lr 5.85e-04 | 8478.82 ms | -100.0% bf16 MFU | 61902 tok/s +step 2606/19560 | loss 3.745344 (-0.53z)| norm 0.2765 (-0.67z)| lr 5.85e-04 | 8471.27 ms | -100.0% bf16 MFU | 61901 tok/s +step 2607/19560 | loss 3.803241 (+1.05z)| norm 0.2813 (-0.47z)| lr 5.85e-04 | 8473.76 ms | -100.0% bf16 MFU | 61900 tok/s +step 2608/19560 | loss 3.786233 (+0.59z)| norm 0.2880 (-0.19z)| lr 5.85e-04 | 8470.80 ms | -100.0% bf16 MFU | 61899 tok/s +step 2609/19560 | loss 3.765391 (+0.02z)| norm 0.3004 (+0.32z)| lr 5.85e-04 | 8469.33 ms | -100.0% bf16 MFU | 61900 tok/s +step 2610/19560 | loss 3.727713 (-1.00z)| norm 0.2984 (+0.25z)| lr 5.85e-04 | 8469.66 ms | -100.0% bf16 MFU | 61900 tok/s +step 2611/19560 | loss 3.789674 (+0.69z)| norm 0.2890 (-0.13z)| lr 5.85e-04 | 8471.03 ms | -100.0% bf16 MFU | 61899 tok/s +step 2612/19560 | loss 3.793483 (+0.81z)| norm 0.3049 (+0.52z)| lr 5.85e-04 | 8470.55 ms | -100.0% bf16 MFU | 61899 tok/s +step 2613/19560 | loss 3.795574 (+0.85z)| norm 0.3021 (+0.40z)| lr 5.85e-04 | 8473.83 ms | -100.0% bf16 MFU | 61898 tok/s +step 2614/19560 | loss 3.737002 (-0.75z)| norm 0.2799 (-0.50z)| lr 5.85e-04 | 8469.96 ms | -100.0% bf16 MFU | 61898 tok/s +step 2615/19560 | loss 3.739752 (-0.68z)| norm 0.2829 (-0.37z)| lr 5.85e-04 | 8466.42 ms | -100.0% bf16 MFU | 61899 tok/s +step 2616/19560 | loss 3.733447 (-0.84z)| norm 0.2877 (-0.18z)| lr 5.85e-04 | 8474.24 ms | -100.0% bf16 MFU | 61898 tok/s +step 2617/19560 | loss 3.752296 (-0.32z)| norm 0.2598 (-1.32z)| lr 5.85e-04 | 8471.32 ms | -100.0% bf16 MFU | 61897 tok/s +step 2618/19560 | loss 3.749735 (-0.38z)| norm 0.2908 (-0.05z)| lr 5.85e-04 | 8468.14 ms | -100.0% bf16 MFU | 61898 tok/s +step 2619/19560 | loss 3.739467 (-0.66z)| norm 0.2726 (-0.81z)| lr 5.85e-04 | 8470.72 ms | -100.0% bf16 MFU | 61898 tok/s +step 2620/19560 | loss 3.757597 (-0.16z)| norm 0.2921 (-0.02z)| lr 5.85e-04 | 8468.18 ms | -100.0% bf16 MFU | 61899 tok/s +step 2621/19560 | loss 3.790600 (+0.76z)| norm 0.3440 (+2.14z)| lr 5.85e-04 | 8469.43 ms | -100.0% bf16 MFU | 61899 tok/s +step 2622/19560 | loss 3.782701 (+0.54z)| norm 0.3549 (+2.61z)| lr 5.85e-04 | 8468.26 ms | -100.0% bf16 MFU | 61900 tok/s +step 2623/19560 | loss 3.766333 (+0.09z)| norm 0.3893 (+3.79z)| lr 5.85e-04 | 8473.38 ms | -100.0% bf16 MFU | 61898 tok/s +step 2624/19560 | loss 3.790072 (+0.78z)| norm 0.3055 (+0.47z)| lr 5.85e-04 | 8469.78 ms | -100.0% bf16 MFU | 61898 tok/s +step 2625/19560 | loss 3.706699 (-1.57z)| norm 0.2817 (-0.47z)| lr 5.85e-04 | 8460.49 ms | -100.0% bf16 MFU | 61902 tok/s +step 2626/19560 | loss 3.724478 (-1.07z)| norm 0.2690 (-0.98z)| lr 5.85e-04 | 8466.26 ms | -100.0% bf16 MFU | 61903 tok/s +step 2627/19560 | loss 3.775849 (+0.40z)| norm 0.3049 (+0.44z)| lr 5.85e-04 | 8472.03 ms | -100.0% bf16 MFU | 61902 tok/s +step 2628/19560 | loss 3.813215 (+1.48z)| norm 0.3010 (+0.28z)| lr 5.85e-04 | 8467.35 ms | -100.0% bf16 MFU | 61903 tok/s +step 2629/19560 | loss 3.734482 (-0.78z)| norm 0.2836 (-0.40z)| lr 5.85e-04 | 8471.94 ms | -100.0% bf16 MFU | 61902 tok/s +step 2630/19560 | loss 3.743753 (-0.50z)| norm 0.2694 (-0.95z)| lr 5.85e-04 | 8471.41 ms | -100.0% bf16 MFU | 61902 tok/s +step 2631/19560 | loss 3.787499 (+0.75z)| norm 0.2524 (-1.60z)| lr 5.85e-04 | 8463.54 ms | -100.0% bf16 MFU | 61904 tok/s +step 2632/19560 | loss 3.704263 (-1.62z)| norm 0.2603 (-1.27z)| lr 5.85e-04 | 8468.02 ms | -100.0% bf16 MFU | 61904 tok/s +step 2633/19560 | loss 3.740602 (-0.57z)| norm 0.2968 (+0.16z)| lr 5.85e-04 | 8469.47 ms | -100.0% bf16 MFU | 61904 tok/s +step 2634/19560 | loss 3.770321 (+0.28z)| norm 0.3184 (+0.99z)| lr 5.85e-04 | 8461.85 ms | -100.0% bf16 MFU | 61907 tok/s +step 2635/19560 | loss 3.825146 (+1.87z)| norm 0.2893 (-0.16z)| lr 5.85e-04 | 8467.42 ms | -100.0% bf16 MFU | 61908 tok/s +step 2636/19560 | loss 3.788903 (+0.84z)| norm 0.2724 (-0.82z)| lr 5.85e-04 | 8469.74 ms | -100.0% bf16 MFU | 61907 tok/s +step 2637/19560 | loss 3.782558 (+0.65z)| norm 0.2992 (+0.23z)| lr 5.85e-04 | 8469.56 ms | -100.0% bf16 MFU | 61907 tok/s +step 2638/19560 | loss 3.766197 (+0.16z)| norm 0.2876 (-0.23z)| lr 5.85e-04 | 8467.04 ms | -100.0% bf16 MFU | 61908 tok/s +step 2639/19560 | loss 3.750087 (-0.32z)| norm 0.2817 (-0.45z)| lr 5.85e-04 | 8463.25 ms | -100.0% bf16 MFU | 61910 tok/s +step 2640/19560 | loss 3.718943 (-1.22z)| norm 0.2837 (-0.37z)| lr 5.84e-04 | 8468.67 ms | -100.0% bf16 MFU | 61910 tok/s +step 2641/19560 | loss 3.770244 (+0.27z)| norm 0.2597 (-1.32z)| lr 5.84e-04 | 8470.19 ms | -100.0% bf16 MFU | 61909 tok/s +step 2642/19560 | loss 3.733835 (-0.81z)| norm 0.2731 (-0.79z)| lr 5.84e-04 | 8465.31 ms | -100.0% bf16 MFU | 61910 tok/s +step 2643/19560 | loss 3.715548 (-1.32z)| norm 0.2852 (-0.32z)| lr 5.84e-04 | 8461.32 ms | -100.0% bf16 MFU | 61913 tok/s +step 2644/19560 | loss 3.751941 (-0.26z)| norm 0.2872 (-0.24z)| lr 5.84e-04 | 8469.04 ms | -100.0% bf16 MFU | 61913 tok/s +step 2645/19560 | loss 3.781690 (+0.62z)| norm 0.3082 (+0.59z)| lr 5.84e-04 | 8468.98 ms | -100.0% bf16 MFU | 61912 tok/s +step 2646/19560 | loss 3.875692 (+3.23z)| norm 0.2665 (-1.05z)| lr 5.84e-04 | 8464.29 ms | -100.0% bf16 MFU | 61914 tok/s +step 2647/19560 | loss 3.753749 (-0.21z)| norm 0.2619 (-1.22z)| lr 5.84e-04 | 8464.77 ms | -100.0% bf16 MFU | 61915 tok/s +step 2648/19560 | loss 3.741584 (-0.56z)| norm 0.2920 (-0.02z)| lr 5.84e-04 | 8464.16 ms | -100.0% bf16 MFU | 61916 tok/s +step 2649/19560 | loss 3.742262 (-0.54z)| norm 0.2859 (-0.25z)| lr 5.84e-04 | 8465.95 ms | -100.0% bf16 MFU | 61917 tok/s +step 2650/19560 | loss 3.737000 (-0.70z)| norm 0.3112 (+0.75z)| lr 5.84e-04 | 8466.43 ms | -100.0% bf16 MFU | 61917 tok/s +step 2651/19560 | loss 3.739195 (-0.64z)| norm 0.3175 (+0.99z)| lr 5.84e-04 | 8464.27 ms | -100.0% bf16 MFU | 61919 tok/s +step 2652/19560 | loss 3.808830 (+1.33z)| norm 0.2704 (-0.86z)| lr 5.84e-04 | 8463.60 ms | -100.0% bf16 MFU | 61920 tok/s +step 2653/19560 | loss 3.763365 (+0.04z)| norm 0.2831 (-0.37z)| lr 5.84e-04 | 8464.80 ms | -100.0% bf16 MFU | 61921 tok/s +step 2654/19560 | loss 3.750804 (-0.34z)| norm 0.2759 (-0.64z)| lr 5.84e-04 | 8464.15 ms | -100.0% bf16 MFU | 61922 tok/s +step 2655/19560 | loss 3.758112 (-0.12z)| norm 0.2886 (-0.14z)| lr 5.84e-04 | 8464.88 ms | -100.0% bf16 MFU | 61923 tok/s +step 2656/19560 | loss 3.743263 (-0.55z)| norm 0.2875 (-0.18z)| lr 5.84e-04 | 8468.35 ms | -100.0% bf16 MFU | 61922 tok/s +step 2657/19560 | loss 3.731141 (-0.92z)| norm 0.2761 (-0.62z)| lr 5.84e-04 | 8464.01 ms | -100.0% bf16 MFU | 61923 tok/s +step 2658/19560 | loss 3.792243 (+0.87z)| norm 0.2737 (-0.71z)| lr 5.84e-04 | 8458.29 ms | -100.0% bf16 MFU | 61926 tok/s +step 2659/19560 | loss 3.735176 (-0.79z)| norm 0.2679 (-0.92z)| lr 5.84e-04 | 8453.71 ms | -100.0% bf16 MFU | 61931 tok/s +step 2660/19560 | loss 3.753383 (-0.26z)| norm 0.2772 (-0.55z)| lr 5.84e-04 | 8447.29 ms | -100.0% bf16 MFU | 61938 tok/s +step 2661/19560 | loss 3.752289 (-0.28z)| norm 0.2887 (-0.10z)| lr 5.84e-04 | 8441.69 ms | -100.0% bf16 MFU | 61946 tok/s +step 2662/19560 | loss 3.783329 (+0.64z)| norm 0.2860 (-0.21z)| lr 5.84e-04 | 8443.91 ms | -100.0% bf16 MFU | 61953 tok/s +step 2663/19560 | loss 3.711658 (-1.46z)| norm 0.2749 (-0.66z)| lr 5.84e-04 | 8441.96 ms | -100.0% bf16 MFU | 61961 tok/s +step 2664/19560 | loss 3.768183 (+0.22z)| norm 0.2786 (-0.53z)| lr 5.84e-04 | 8443.19 ms | -100.0% bf16 MFU | 61968 tok/s +step 2665/19560 | loss 3.769139 (+0.26z)| norm 0.3429 (+2.03z)| lr 5.84e-04 | 8444.25 ms | -100.0% bf16 MFU | 61974 tok/s +step 2666/19560 | loss 3.794679 (+1.01z)| norm 0.3479 (+2.23z)| lr 5.84e-04 | 8443.48 ms | -100.0% bf16 MFU | 61980 tok/s +step 2667/19560 | loss 3.763189 (+0.06z)| norm 0.3716 (+3.14z)| lr 5.84e-04 | 8437.95 ms | -100.0% bf16 MFU | 61987 tok/s +step 2668/19560 | loss 3.796464 (+1.06z)| norm 0.3507 (+2.26z)| lr 5.84e-04 | 8448.07 ms | -100.0% bf16 MFU | 61991 tok/s +step 2669/19560 | loss 3.761228 (-0.01z)| norm 0.3115 (+0.73z)| lr 5.84e-04 | 8439.31 ms | -100.0% bf16 MFU | 61998 tok/s +step 2670/19560 | loss 3.733307 (-0.86z)| norm 0.2771 (-0.60z)| lr 5.84e-04 | 8437.78 ms | -100.0% bf16 MFU | 62005 tok/s +step 2671/19560 | loss 3.784363 (+0.68z)| norm 0.2899 (-0.10z)| lr 5.84e-04 | 8436.11 ms | -100.0% bf16 MFU | 62012 tok/s +step 2672/19560 | loss 3.753939 (-0.24z)| norm 0.3379 (+1.72z)| lr 5.84e-04 | 8439.22 ms | -100.0% bf16 MFU | 62017 tok/s +step 2673/19560 | loss 3.726642 (-1.05z)| norm 0.3035 (+0.40z)| lr 5.84e-04 | 8439.89 ms | -100.0% bf16 MFU | 62023 tok/s +step 2674/19560 | loss 3.726474 (-1.05z)| norm 0.2582 (-1.31z)| lr 5.84e-04 | 8443.24 ms | -100.0% bf16 MFU | 62026 tok/s +step 2675/19560 | loss 3.760520 (-0.01z)| norm 0.2924 (-0.00z)| lr 5.84e-04 | 8442.69 ms | -100.0% bf16 MFU | 62030 tok/s +step 2676/19560 | loss 3.684959 (-2.25z)| norm 0.2629 (-1.12z)| lr 5.84e-04 | 8446.31 ms | -100.0% bf16 MFU | 62032 tok/s +step 2677/19560 | loss 3.761761 (+0.06z)| norm 0.2506 (-1.56z)| lr 5.84e-04 | 8446.67 ms | -100.0% bf16 MFU | 62034 tok/s +step 2678/19560 | loss 3.706159 (-1.59z)| norm 0.2379 (-1.99z)| lr 5.84e-04 | 8442.12 ms | -100.0% bf16 MFU | 62038 tok/s +step 2679/19560 | loss 3.790486 (+0.91z)| norm 0.2684 (-0.85z)| lr 5.84e-04 | 8441.23 ms | -100.0% bf16 MFU | 62041 tok/s +step 2680/19560 | loss 3.729570 (-0.89z)| norm 0.2513 (-1.46z)| lr 5.84e-04 | 8443.48 ms | -100.0% bf16 MFU | 62044 tok/s +step 2681/19560 | loss 3.796656 (+1.11z)| norm 0.2770 (-0.51z)| lr 5.84e-04 | 8445.11 ms | -100.0% bf16 MFU | 62046 tok/s +step 2682/19560 | loss 3.731278 (-0.84z)| norm 0.3055 (+0.54z)| lr 5.84e-04 | 8447.93 ms | -100.0% bf16 MFU | 62046 tok/s +step 2683/19560 | loss 3.711680 (-1.42z)| norm 0.3188 (+1.02z)| lr 5.84e-04 | 8450.95 ms | -100.0% bf16 MFU | 62046 tok/s +step 2684/19560 | loss 3.750023 (-0.25z)| norm 0.3301 (+1.41z)| lr 5.84e-04 | 8452.56 ms | -100.0% bf16 MFU | 62045 tok/s +step 2685/19560 | loss 3.799945 (+1.26z)| norm 0.2988 (+0.27z)| lr 5.84e-04 | 8448.74 ms | -100.0% bf16 MFU | 62046 tok/s +step 2686/19560 | loss 3.699807 (-1.76z)| norm 0.3288 (+1.34z)| lr 5.84e-04 | 8450.33 ms | -100.0% bf16 MFU | 62046 tok/s +step 2687/19560 | loss 3.711905 (-1.38z)| norm 0.2897 (-0.08z)| lr 5.84e-04 | 8452.56 ms | -100.0% bf16 MFU | 62045 tok/s +step 2688/19560 | loss 3.743758 (-0.41z)| norm 0.3007 (+0.31z)| lr 5.84e-04 | 8450.27 ms | -100.0% bf16 MFU | 62045 tok/s +step 2689/19560 | loss 3.752241 (-0.16z)| norm 0.2843 (-0.30z)| lr 5.84e-04 | 8454.64 ms | -100.0% bf16 MFU | 62043 tok/s +step 2690/19560 | loss 3.822741 (+1.96z)| norm 0.2873 (-0.19z)| lr 5.84e-04 | 8458.36 ms | -100.0% bf16 MFU | 62040 tok/s +step 2691/19560 | loss 3.774821 (+0.51z)| norm 0.2954 (+0.10z)| lr 5.84e-04 | 8450.32 ms | -100.0% bf16 MFU | 62040 tok/s +step 2692/19560 | loss 3.781852 (+0.72z)| norm 0.2860 (-0.24z)| lr 5.84e-04 | 8453.75 ms | -100.0% bf16 MFU | 62039 tok/s +step 2693/19560 | loss 3.753665 (-0.15z)| norm 0.2869 (-0.22z)| lr 5.84e-04 | 8453.45 ms | -100.0% bf16 MFU | 62038 tok/s +step 2694/19560 | loss 3.765273 (+0.20z)| norm 0.2915 (-0.06z)| lr 5.84e-04 | 8456.02 ms | -100.0% bf16 MFU | 62036 tok/s +step 2695/19560 | loss 3.692188 (-2.00z)| norm 0.2626 (-1.13z)| lr 5.84e-04 | 8454.12 ms | -100.0% bf16 MFU | 62035 tok/s +step 2696/19560 | loss 3.757579 (-0.03z)| norm 0.2721 (-0.78z)| lr 5.84e-04 | 8455.81 ms | -100.0% bf16 MFU | 62034 tok/s +step 2697/19560 | loss 3.708858 (-1.48z)| norm 0.2706 (-0.83z)| lr 5.84e-04 | 8460.09 ms | -100.0% bf16 MFU | 62031 tok/s +step 2698/19560 | loss 3.707178 (-1.54z)| norm 0.2629 (-1.10z)| lr 5.84e-04 | 8461.35 ms | -100.0% bf16 MFU | 62027 tok/s +step 2699/19560 | loss 3.772186 (+0.42z)| norm 0.2964 (+0.12z)| lr 5.84e-04 | 8460.52 ms | -100.0% bf16 MFU | 62024 tok/s +step 2700/19560 | loss 3.763062 (+0.16z)| norm 0.2983 (+0.19z)| lr 5.84e-04 | 8457.51 ms | -100.0% bf16 MFU | 62023 tok/s +step 2701/19560 | loss 3.727683 (-0.91z)| norm 0.3023 (+0.35z)| lr 5.84e-04 | 8458.70 ms | -100.0% bf16 MFU | 62021 tok/s +step 2702/19560 | loss 3.846924 (+2.62z)| norm 0.3157 (+0.85z)| lr 5.83e-04 | 8460.29 ms | -100.0% bf16 MFU | 62018 tok/s +step 2703/19560 | loss 3.754107 (-0.14z)| norm 0.2997 (+0.27z)| lr 5.83e-04 | 8458.68 ms | -100.0% bf16 MFU | 62016 tok/s +step 2704/19560 | loss 3.725892 (-0.98z)| norm 0.3054 (+0.48z)| lr 5.83e-04 | 8457.58 ms | -100.0% bf16 MFU | 62015 tok/s +step 2705/19560 | loss 3.822663 (+1.86z)| norm 0.2989 (+0.23z)| lr 5.83e-04 | 8460.19 ms | -100.0% bf16 MFU | 62013 tok/s +step 2706/19560 | loss 3.719362 (-1.16z)| norm 0.2869 (-0.21z)| lr 5.83e-04 | 8460.05 ms | -100.0% bf16 MFU | 62011 tok/s +step 2707/19560 | loss 3.710727 (-1.41z)| norm 0.2681 (-0.90z)| lr 5.83e-04 | 8456.90 ms | -100.0% bf16 MFU | 62010 tok/s +step 2708/19560 | loss 3.765172 (+0.17z)| norm 0.2616 (-1.13z)| lr 5.83e-04 | 8458.56 ms | -100.0% bf16 MFU | 62009 tok/s +step 2709/19560 | loss 3.760915 (+0.05z)| norm 0.2827 (-0.32z)| lr 5.83e-04 | 8459.61 ms | -100.0% bf16 MFU | 62007 tok/s +step 2710/19560 | loss 3.753034 (-0.18z)| norm 0.2562 (-1.31z)| lr 5.83e-04 | 8458.77 ms | -100.0% bf16 MFU | 62006 tok/s +step 2711/19560 | loss 3.724515 (-1.02z)| norm 0.2586 (-1.20z)| lr 5.83e-04 | 8457.04 ms | -100.0% bf16 MFU | 62005 tok/s +step 2712/19560 | loss 3.782898 (+0.71z)| norm 0.2916 (+0.04z)| lr 5.83e-04 | 8456.72 ms | -100.0% bf16 MFU | 62005 tok/s +step 2713/19560 | loss 3.814852 (+1.64z)| norm 0.2961 (+0.20z)| lr 5.83e-04 | 8459.15 ms | -100.0% bf16 MFU | 62003 tok/s +step 2714/19560 | loss 3.770460 (+0.34z)| norm 0.2745 (-0.62z)| lr 5.83e-04 | 8456.06 ms | -100.0% bf16 MFU | 62003 tok/s +step 2715/19560 | loss 3.759742 (+0.03z)| norm 0.3332 (+1.58z)| lr 5.83e-04 | 8455.64 ms | -100.0% bf16 MFU | 62003 tok/s +step 2716/19560 | loss 3.742950 (-0.46z)| norm 0.3289 (+1.40z)| lr 5.83e-04 | 8453.37 ms | -100.0% bf16 MFU | 62004 tok/s +step 2717/19560 | loss 3.715331 (-1.26z)| norm 0.3064 (+0.54z)| lr 5.83e-04 | 8455.42 ms | -100.0% bf16 MFU | 62004 tok/s +step 2718/19560 | loss 3.695351 (-1.87z)| norm 0.2817 (-0.40z)| lr 5.83e-04 | 8457.09 ms | -100.0% bf16 MFU | 62004 tok/s +step 2719/19560 | loss 3.719223 (-1.15z)| norm 0.2671 (-0.95z)| lr 5.83e-04 | 8456.86 ms | -100.0% bf16 MFU | 62003 tok/s +step 2720/19560 | loss 3.718296 (-1.17z)| norm 0.2779 (-0.54z)| lr 5.83e-04 | 8456.02 ms | -100.0% bf16 MFU | 62003 tok/s +step 2721/19560 | loss 3.772713 (+0.43z)| norm 0.2603 (-1.20z)| lr 5.83e-04 | 8459.78 ms | -100.0% bf16 MFU | 62002 tok/s +step 2722/19560 | loss 3.743198 (-0.42z)| norm 0.2567 (-1.31z)| lr 5.83e-04 | 8460.43 ms | -100.0% bf16 MFU | 62000 tok/s +step 2723/19560 | loss 3.724934 (-0.96z)| norm 0.2685 (-0.86z)| lr 5.83e-04 | 8458.73 ms | -100.0% bf16 MFU | 61999 tok/s +step 2724/19560 | loss 3.741657 (-0.45z)| norm 0.3069 (+0.61z)| lr 5.83e-04 | 8457.68 ms | -100.0% bf16 MFU | 61999 tok/s +step 2725/19560 | loss 3.781018 (+0.72z)| norm 0.3079 (+0.68z)| lr 5.83e-04 | 8459.69 ms | -100.0% bf16 MFU | 61998 tok/s +step 2726/19560 | loss 3.785191 (+0.84z)| norm 0.3094 (+0.73z)| lr 5.83e-04 | 8456.11 ms | -100.0% bf16 MFU | 61998 tok/s +step 2727/19560 | loss 3.761116 (+0.12z)| norm 0.2940 (+0.14z)| lr 5.83e-04 | 8458.66 ms | -100.0% bf16 MFU | 61997 tok/s +step 2728/19560 | loss 3.715707 (-1.22z)| norm 0.3128 (+0.90z)| lr 5.83e-04 | 8457.29 ms | -100.0% bf16 MFU | 61997 tok/s +step 2729/19560 | loss 3.727900 (-0.85z)| norm 0.3551 (+2.52z)| lr 5.83e-04 | 8458.16 ms | -100.0% bf16 MFU | 61996 tok/s +step 2730/19560 | loss 3.771398 (+0.47z)| norm 0.3382 (+1.82z)| lr 5.83e-04 | 8458.67 ms | -100.0% bf16 MFU | 61996 tok/s +step 2731/19560 | loss 3.718465 (-1.12z)| norm 0.3140 (+0.88z)| lr 5.83e-04 | 8458.09 ms | -100.0% bf16 MFU | 61995 tok/s +step 2732/19560 | loss 3.733372 (-0.66z)| norm 0.2830 (-0.33z)| lr 5.83e-04 | 8455.69 ms | -100.0% bf16 MFU | 61996 tok/s +step 2733/19560 | loss 3.761156 (+0.18z)| norm 0.2902 (-0.05z)| lr 5.83e-04 | 8458.05 ms | -100.0% bf16 MFU | 61995 tok/s +step 2734/19560 | loss 3.849188 (+2.72z)| norm 0.2910 (-0.02z)| lr 5.83e-04 | 8450.87 ms | -100.0% bf16 MFU | 61997 tok/s +step 2735/19560 | loss 3.763358 (+0.22z)| norm 0.2923 (+0.02z)| lr 5.83e-04 | 8445.98 ms | -100.0% bf16 MFU | 62001 tok/s +step 2736/19560 | loss 3.733212 (-0.66z)| norm 0.2882 (-0.14z)| lr 5.83e-04 | 8441.04 ms | -100.0% bf16 MFU | 62007 tok/s +step 2737/19560 | loss 3.735112 (-0.59z)| norm 0.2626 (-1.11z)| lr 5.83e-04 | 8440.53 ms | -100.0% bf16 MFU | 62012 tok/s +step 2738/19560 | loss 3.790698 (+1.03z)| norm 0.2681 (-0.89z)| lr 5.83e-04 | 8439.08 ms | -100.0% bf16 MFU | 62018 tok/s +step 2739/19560 | loss 3.696886 (-1.70z)| norm 0.2650 (-1.00z)| lr 5.83e-04 | 8437.44 ms | -100.0% bf16 MFU | 62024 tok/s +step 2740/19560 | loss 3.722387 (-0.94z)| norm 0.2652 (-0.98z)| lr 5.83e-04 | 8436.97 ms | -100.0% bf16 MFU | 62030 tok/s +step 2741/19560 | loss 3.731197 (-0.67z)| norm 0.2898 (-0.03z)| lr 5.83e-04 | 8438.74 ms | -100.0% bf16 MFU | 62035 tok/s +step 2742/19560 | loss 3.705614 (-1.40z)| norm 0.3076 (+0.64z)| lr 5.83e-04 | 8435.35 ms | -100.0% bf16 MFU | 62041 tok/s +step 2743/19560 | loss 3.768434 (+0.43z)| norm 0.2965 (+0.21z)| lr 5.83e-04 | 8435.38 ms | -100.0% bf16 MFU | 62046 tok/s +step 2744/19560 | loss 3.707572 (-1.34z)| norm 0.2843 (-0.25z)| lr 5.83e-04 | 8440.32 ms | -100.0% bf16 MFU | 62050 tok/s +step 2745/19560 | loss 3.768633 (+0.43z)| norm 0.2850 (-0.24z)| lr 5.83e-04 | 8434.36 ms | -100.0% bf16 MFU | 62055 tok/s +step 2746/19560 | loss 3.734668 (-0.55z)| norm 0.3024 (+0.43z)| lr 5.83e-04 | 8437.38 ms | -100.0% bf16 MFU | 62060 tok/s +step 2747/19560 | loss 3.762103 (+0.24z)| norm 0.2821 (-0.35z)| lr 5.83e-04 | 8434.86 ms | -100.0% bf16 MFU | 62064 tok/s +step 2748/19560 | loss 3.709970 (-1.25z)| norm 0.2793 (-0.46z)| lr 5.83e-04 | 8436.46 ms | -100.0% bf16 MFU | 62069 tok/s +step 2749/19560 | loss 3.692254 (-1.73z)| norm 0.2892 (-0.06z)| lr 5.83e-04 | 8436.60 ms | -100.0% bf16 MFU | 62072 tok/s +step 2750/19560 | loss 3.740646 (-0.34z)| norm 0.2719 (-0.73z)| lr 5.83e-04 | 8436.10 ms | -100.0% bf16 MFU | 62076 tok/s +val loss 3.721925 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2641/10042 = 0.262995 +step 2751/19560 | loss 3.748576 (-0.10z)| norm 0.2693 (-0.85z)| lr 5.83e-04 | 8452.07 ms | -100.0% bf16 MFU | 62074 tok/s +step 2752/19560 | loss 3.745922 (-0.17z)| norm 0.2697 (-0.82z)| lr 5.83e-04 | 8446.19 ms | -100.0% bf16 MFU | 62074 tok/s +step 2753/19560 | loss 3.729516 (-0.65z)| norm 0.2951 (+0.26z)| lr 5.83e-04 | 8449.06 ms | -100.0% bf16 MFU | 62073 tok/s +step 2754/19560 | loss 3.739221 (-0.38z)| norm 0.2833 (-0.25z)| lr 5.83e-04 | 8449.54 ms | -100.0% bf16 MFU | 62072 tok/s +step 2755/19560 | loss 3.729315 (-0.65z)| norm 0.2501 (-1.63z)| lr 5.83e-04 | 8448.46 ms | -100.0% bf16 MFU | 62071 tok/s +step 2756/19560 | loss 3.733571 (-0.52z)| norm 0.2663 (-0.93z)| lr 5.83e-04 | 8449.04 ms | -100.0% bf16 MFU | 62070 tok/s +step 2757/19560 | loss 3.761089 (+0.28z)| norm 0.2744 (-0.59z)| lr 5.83e-04 | 8451.12 ms | -100.0% bf16 MFU | 62068 tok/s +step 2758/19560 | loss 3.729132 (-0.65z)| norm 0.2746 (-0.58z)| lr 5.83e-04 | 8447.79 ms | -100.0% bf16 MFU | 62068 tok/s +step 2759/19560 | loss 3.654323 (-2.74z)| norm 0.2873 (-0.06z)| lr 5.83e-04 | 8446.93 ms | -100.0% bf16 MFU | 62068 tok/s +step 2760/19560 | loss 3.717973 (-0.93z)| norm 0.3063 (+0.74z)| lr 5.83e-04 | 8448.00 ms | -100.0% bf16 MFU | 62068 tok/s +step 2761/19560 | loss 3.764618 (+0.40z)| norm 0.2909 (+0.08z)| lr 5.83e-04 | 8452.22 ms | -100.0% bf16 MFU | 62066 tok/s +step 2762/19560 | loss 3.720291 (-0.86z)| norm 0.2987 (+0.42z)| lr 5.82e-04 | 8450.28 ms | -100.0% bf16 MFU | 62065 tok/s +step 2763/19560 | loss 3.724972 (-0.71z)| norm 0.3151 (+1.12z)| lr 5.82e-04 | 8452.33 ms | -100.0% bf16 MFU | 62063 tok/s +step 2764/19560 | loss 3.760437 (+0.33z)| norm 0.2720 (-0.73z)| lr 5.82e-04 | 8446.50 ms | -100.0% bf16 MFU | 62063 tok/s +step 2765/19560 | loss 3.773209 (+0.71z)| norm 0.3008 (+0.50z)| lr 5.82e-04 | 8447.45 ms | -100.0% bf16 MFU | 62063 tok/s +step 2766/19560 | loss 3.715090 (-0.98z)| norm 0.2920 (+0.13z)| lr 5.82e-04 | 8448.10 ms | -100.0% bf16 MFU | 62063 tok/s +step 2767/19560 | loss 3.710070 (-1.11z)| norm 0.3339 (+1.88z)| lr 5.82e-04 | 8446.30 ms | -100.0% bf16 MFU | 62064 tok/s +step 2768/19560 | loss 3.750608 (+0.06z)| norm 0.3044 (+0.62z)| lr 5.82e-04 | 8448.99 ms | -100.0% bf16 MFU | 62063 tok/s +step 2769/19560 | loss 3.775291 (+0.78z)| norm 0.2959 (+0.26z)| lr 5.82e-04 | 8447.12 ms | -100.0% bf16 MFU | 62063 tok/s +step 2770/19560 | loss 3.682401 (-1.89z)| norm 0.2756 (-0.61z)| lr 5.82e-04 | 8452.85 ms | -100.0% bf16 MFU | 62061 tok/s +step 2771/19560 | loss 3.851844 (+2.86z)| norm 0.2596 (-1.27z)| lr 5.82e-04 | 8447.52 ms | -100.0% bf16 MFU | 62062 tok/s +step 2772/19560 | loss 3.758595 (+0.26z)| norm 0.2816 (-0.34z)| lr 5.82e-04 | 8450.50 ms | -100.0% bf16 MFU | 62061 tok/s +step 2773/19560 | loss 3.731187 (-0.50z)| norm 0.2900 (+0.02z)| lr 5.82e-04 | 8448.10 ms | -100.0% bf16 MFU | 62061 tok/s +step 2774/19560 | loss 3.722441 (-0.74z)| norm 0.3060 (+0.68z)| lr 5.82e-04 | 8447.71 ms | -100.0% bf16 MFU | 62061 tok/s +step 2775/19560 | loss 3.706665 (-1.19z)| norm 0.3045 (+0.61z)| lr 5.82e-04 | 8451.01 ms | -100.0% bf16 MFU | 62060 tok/s +step 2776/19560 | loss 3.686552 (-1.74z)| norm 0.3090 (+0.79z)| lr 5.82e-04 | 8471.43 ms | -100.0% bf16 MFU | 62051 tok/s +step 2777/19560 | loss 3.746501 (-0.01z)| norm 0.2797 (-0.45z)| lr 5.82e-04 | 8474.00 ms | -100.0% bf16 MFU | 62042 tok/s +step 2778/19560 | loss 3.694339 (-1.50z)| norm 0.2686 (-0.90z)| lr 5.82e-04 | 8472.50 ms | -100.0% bf16 MFU | 62034 tok/s +step 2779/19560 | loss 3.690875 (-1.57z)| norm 0.2925 (+0.12z)| lr 5.82e-04 | 8472.35 ms | -100.0% bf16 MFU | 62026 tok/s +step 2780/19560 | loss 3.673903 (-2.02z)| norm 0.2785 (-0.48z)| lr 5.82e-04 | 8476.13 ms | -100.0% bf16 MFU | 62018 tok/s +step 2781/19560 | loss 3.718828 (-0.74z)| norm 0.2987 (+0.38z)| lr 5.82e-04 | 8475.47 ms | -100.0% bf16 MFU | 62010 tok/s +step 2782/19560 | loss 3.641623 (-2.80z)| norm 0.2695 (-0.87z)| lr 5.82e-04 | 8474.90 ms | -100.0% bf16 MFU | 62003 tok/s +step 2783/19560 | loss 3.706219 (-1.02z)| norm 0.3052 (+0.65z)| lr 5.82e-04 | 8472.74 ms | -100.0% bf16 MFU | 61996 tok/s +step 2784/19560 | loss 3.685325 (-1.56z)| norm 0.2882 (-0.08z)| lr 5.82e-04 | 8473.39 ms | -100.0% bf16 MFU | 61990 tok/s +step 2785/19560 | loss 3.712910 (-0.81z)| norm 0.2973 (+0.30z)| lr 5.82e-04 | 8472.64 ms | -100.0% bf16 MFU | 61985 tok/s +step 2786/19560 | loss 3.690001 (-1.41z)| norm 0.3066 (+0.69z)| lr 5.82e-04 | 8466.21 ms | -100.0% bf16 MFU | 61982 tok/s +step 2787/19560 | loss 3.684343 (-1.53z)| norm 0.2847 (-0.25z)| lr 5.82e-04 | 8469.78 ms | -100.0% bf16 MFU | 61978 tok/s +step 2788/19560 | loss 3.658092 (-2.17z)| norm 0.3094 (+0.79z)| lr 5.82e-04 | 8472.27 ms | -100.0% bf16 MFU | 61973 tok/s +step 2789/19560 | loss 3.679621 (-1.58z)| norm 0.3213 (+1.28z)| lr 5.82e-04 | 8471.97 ms | -100.0% bf16 MFU | 61969 tok/s +step 2790/19560 | loss 3.682783 (-1.47z)| norm 0.2883 (-0.12z)| lr 5.82e-04 | 8470.28 ms | -100.0% bf16 MFU | 61965 tok/s +step 2791/19560 | loss 3.683825 (-1.43z)| norm 0.2636 (-1.16z)| lr 5.82e-04 | 8470.18 ms | -100.0% bf16 MFU | 61962 tok/s +step 2792/19560 | loss 3.707430 (-0.81z)| norm 0.2895 (-0.07z)| lr 5.82e-04 | 8468.57 ms | -100.0% bf16 MFU | 61959 tok/s +step 2793/19560 | loss 3.720312 (-0.47z)| norm 0.2694 (-0.91z)| lr 5.82e-04 | 8462.45 ms | -100.0% bf16 MFU | 61959 tok/s +step 2794/19560 | loss 3.714758 (-0.60z)| norm 0.2696 (-0.89z)| lr 5.82e-04 | 8468.66 ms | -100.0% bf16 MFU | 61956 tok/s +step 2795/19560 | loss 3.727588 (-0.26z)| norm 0.2699 (-0.89z)| lr 5.82e-04 | 8473.84 ms | -100.0% bf16 MFU | 61952 tok/s +step 2796/19560 | loss 3.749658 (+0.32z)| norm 0.2777 (-0.51z)| lr 5.82e-04 | 8469.22 ms | -100.0% bf16 MFU | 61950 tok/s +step 2797/19560 | loss 3.620049 (-2.94z)| norm 0.2637 (-1.16z)| lr 5.82e-04 | 8470.74 ms | -100.0% bf16 MFU | 61947 tok/s +step 2798/19560 | loss 3.666348 (-1.73z)| norm 0.2922 (+0.19z)| lr 5.82e-04 | 8457.74 ms | -100.0% bf16 MFU | 61949 tok/s +step 2799/19560 | loss 3.678075 (-1.42z)| norm 0.3015 (+0.63z)| lr 5.82e-04 | 8468.71 ms | -100.0% bf16 MFU | 61947 tok/s +step 2800/19560 | loss 3.709010 (-0.64z)| norm 0.3172 (+1.40z)| lr 5.82e-04 | 8465.57 ms | -100.0% bf16 MFU | 61946 tok/s +step 2801/19560 | loss 3.785409 (+1.25z)| norm 0.3208 (+1.56z)| lr 5.82e-04 | 8477.54 ms | -100.0% bf16 MFU | 61941 tok/s +step 2802/19560 | loss 3.631622 (-2.48z)| norm 0.2988 (+0.49z)| lr 5.82e-04 | 8475.02 ms | -100.0% bf16 MFU | 61937 tok/s +step 2803/19560 | loss 3.687080 (-1.12z)| norm 0.3043 (+0.75z)| lr 5.82e-04 | 8471.16 ms | -100.0% bf16 MFU | 61935 tok/s +step 2804/19560 | loss 3.763339 (+0.70z)| norm 0.2966 (+0.37z)| lr 5.82e-04 | 8470.16 ms | -100.0% bf16 MFU | 61933 tok/s +step 2805/19560 | loss 3.714256 (-0.48z)| norm 0.3207 (+1.52z)| lr 5.82e-04 | 8468.53 ms | -100.0% bf16 MFU | 61932 tok/s +step 2806/19560 | loss 3.793695 (+1.42z)| norm 0.3466 (+2.73z)| lr 5.82e-04 | 8472.64 ms | -100.0% bf16 MFU | 61929 tok/s +step 2807/19560 | loss 3.693872 (-0.96z)| norm 0.2912 (+0.03z)| lr 5.82e-04 | 8468.44 ms | -100.0% bf16 MFU | 61928 tok/s +step 2808/19560 | loss 3.724263 (-0.23z)| norm 0.2794 (-0.56z)| lr 5.82e-04 | 8468.55 ms | -100.0% bf16 MFU | 61928 tok/s +step 2809/19560 | loss 3.655987 (-1.84z)| norm 0.2869 (-0.20z)| lr 5.82e-04 | 8468.41 ms | -100.0% bf16 MFU | 61927 tok/s +step 2810/19560 | loss 3.697920 (-0.83z)| norm 0.2599 (-1.50z)| lr 5.82e-04 | 8463.62 ms | -100.0% bf16 MFU | 61928 tok/s +step 2811/19560 | loss 3.683202 (-1.17z)| norm 0.2429 (-2.28z)| lr 5.82e-04 | 8467.60 ms | -100.0% bf16 MFU | 61927 tok/s +step 2812/19560 | loss 3.651272 (-1.89z)| norm 0.2544 (-1.70z)| lr 5.82e-04 | 8471.93 ms | -100.0% bf16 MFU | 61925 tok/s +step 2813/19560 | loss 3.672197 (-1.37z)| norm 0.2403 (-2.32z)| lr 5.82e-04 | 8466.79 ms | -100.0% bf16 MFU | 61925 tok/s +step 2814/19560 | loss 3.671955 (-1.37z)| norm 0.2568 (-1.51z)| lr 5.82e-04 | 8468.27 ms | -100.0% bf16 MFU | 61924 tok/s +step 2815/19560 | loss 3.662353 (-1.57z)| norm 0.2669 (-1.01z)| lr 5.82e-04 | 8469.14 ms | -100.0% bf16 MFU | 61923 tok/s +step 2816/19560 | loss 3.670177 (-1.36z)| norm 0.2475 (-1.89z)| lr 5.82e-04 | 8468.71 ms | -100.0% bf16 MFU | 61923 tok/s +step 2817/19560 | loss 3.698089 (-0.71z)| norm 0.2446 (-1.98z)| lr 5.82e-04 | 8470.23 ms | -100.0% bf16 MFU | 61921 tok/s +step 2818/19560 | loss 3.668615 (-1.38z)| norm 0.2955 (+0.37z)| lr 5.82e-04 | 8467.29 ms | -100.0% bf16 MFU | 61921 tok/s +step 2819/19560 | loss 3.637151 (-2.06z)| norm 0.2867 (-0.03z)| lr 5.82e-04 | 8467.91 ms | -100.0% bf16 MFU | 61921 tok/s +step 2820/19560 | loss 3.704051 (-0.51z)| norm 0.2733 (-0.65z)| lr 5.82e-04 | 8469.38 ms | -100.0% bf16 MFU | 61920 tok/s +step 2821/19560 | loss 3.730751 (+0.11z)| norm 0.2829 (-0.20z)| lr 5.81e-04 | 8464.13 ms | -100.0% bf16 MFU | 61921 tok/s +step 2822/19560 | loss 3.677141 (-1.11z)| norm 0.2830 (-0.20z)| lr 5.81e-04 | 8467.16 ms | -100.0% bf16 MFU | 61921 tok/s +step 2823/19560 | loss 3.720472 (-0.11z)| norm 0.2828 (-0.21z)| lr 5.81e-04 | 8468.30 ms | -100.0% bf16 MFU | 61921 tok/s +step 2824/19560 | loss 3.679129 (-1.06z)| norm 0.2957 (+0.38z)| lr 5.81e-04 | 8463.27 ms | -100.0% bf16 MFU | 61922 tok/s +step 2825/19560 | loss 3.670368 (-1.25z)| norm 0.2655 (-1.02z)| lr 5.81e-04 | 8471.24 ms | -100.0% bf16 MFU | 61921 tok/s +step 2826/19560 | loss 3.700425 (-0.55z)| norm 0.2734 (-0.66z)| lr 5.81e-04 | 8470.41 ms | -100.0% bf16 MFU | 61919 tok/s +step 2827/19560 | loss 3.781796 (+1.32z)| norm 0.2996 (+0.56z)| lr 5.81e-04 | 8464.17 ms | -100.0% bf16 MFU | 61920 tok/s +step 2828/19560 | loss 3.664188 (-1.36z)| norm 0.2945 (+0.32z)| lr 5.81e-04 | 8472.11 ms | -100.0% bf16 MFU | 61919 tok/s +step 2829/19560 | loss 3.663375 (-1.36z)| norm 0.2933 (+0.27z)| lr 5.81e-04 | 8469.34 ms | -100.0% bf16 MFU | 61918 tok/s +step 2830/19560 | loss 3.689717 (-0.75z)| norm 0.3108 (+1.09z)| lr 5.81e-04 | 8468.58 ms | -100.0% bf16 MFU | 61918 tok/s +step 2831/19560 | loss 3.669515 (-1.21z)| norm 0.3250 (+1.73z)| lr 5.81e-04 | 8462.06 ms | -100.0% bf16 MFU | 61920 tok/s +step 2832/19560 | loss 3.759845 (+0.89z)| norm 0.3388 (+2.31z)| lr 5.81e-04 | 8467.34 ms | -100.0% bf16 MFU | 61919 tok/s +step 2833/19560 | loss 3.699015 (-0.51z)| norm 0.3195 (+1.42z)| lr 5.81e-04 | 8465.30 ms | -100.0% bf16 MFU | 61920 tok/s +step 2834/19560 | loss 3.754739 (+0.80z)| norm 0.2658 (-1.00z)| lr 5.81e-04 | 8459.08 ms | -100.0% bf16 MFU | 61923 tok/s +step 2835/19560 | loss 3.741937 (+0.49z)| norm 0.2964 (+0.37z)| lr 5.81e-04 | 8469.36 ms | -100.0% bf16 MFU | 61922 tok/s +step 2836/19560 | loss 3.698586 (-0.52z)| norm 0.3117 (+1.04z)| lr 5.81e-04 | 8471.51 ms | -100.0% bf16 MFU | 61921 tok/s +step 2837/19560 | loss 3.732539 (+0.29z)| norm 0.3367 (+2.12z)| lr 5.81e-04 | 8467.77 ms | -100.0% bf16 MFU | 61920 tok/s +step 2838/19560 | loss 3.772140 (+1.22z)| norm 0.3258 (+1.61z)| lr 5.81e-04 | 8464.40 ms | -100.0% bf16 MFU | 61921 tok/s +step 2839/19560 | loss 3.807243 (+2.01z)| norm 0.3119 (+0.98z)| lr 5.81e-04 | 8460.84 ms | -100.0% bf16 MFU | 61924 tok/s +step 2840/19560 | loss 3.710496 (-0.24z)| norm 0.3211 (+1.37z)| lr 5.81e-04 | 8468.11 ms | -100.0% bf16 MFU | 61923 tok/s +step 2841/19560 | loss 3.665974 (-1.28z)| norm 0.2821 (-0.35z)| lr 5.81e-04 | 8465.95 ms | -100.0% bf16 MFU | 61923 tok/s +step 2842/19560 | loss 3.706568 (-0.30z)| norm 0.2693 (-0.91z)| lr 5.81e-04 | 8465.34 ms | -100.0% bf16 MFU | 61924 tok/s +step 2843/19560 | loss 3.660735 (-1.37z)| norm 0.2629 (-1.18z)| lr 5.81e-04 | 8463.26 ms | -100.0% bf16 MFU | 61925 tok/s +step 2844/19560 | loss 3.720216 (+0.05z)| norm 0.2773 (-0.53z)| lr 5.81e-04 | 8461.26 ms | -100.0% bf16 MFU | 61927 tok/s +step 2845/19560 | loss 3.701272 (-0.40z)| norm 0.3027 (+0.61z)| lr 5.81e-04 | 8467.38 ms | -100.0% bf16 MFU | 61927 tok/s +step 2846/19560 | loss 3.720033 (+0.05z)| norm 0.2751 (-0.62z)| lr 5.81e-04 | 8458.12 ms | -100.0% bf16 MFU | 61930 tok/s +step 2847/19560 | loss 3.635239 (-1.94z)| norm 0.2691 (-0.90z)| lr 5.81e-04 | 8463.01 ms | -100.0% bf16 MFU | 61931 tok/s +step 2848/19560 | loss 3.637107 (-1.85z)| norm 0.3021 (+0.58z)| lr 5.81e-04 | 8464.72 ms | -100.0% bf16 MFU | 61931 tok/s +step 2849/19560 | loss 3.698746 (-0.41z)| norm 0.2985 (+0.41z)| lr 5.81e-04 | 8460.54 ms | -100.0% bf16 MFU | 61933 tok/s +step 2850/19560 | loss 3.744820 (+0.67z)| norm 0.2861 (-0.17z)| lr 5.81e-04 | 8459.15 ms | -100.0% bf16 MFU | 61935 tok/s +step 2851/19560 | loss 3.714213 (-0.05z)| norm 0.2708 (-0.87z)| lr 5.81e-04 | 8458.69 ms | -100.0% bf16 MFU | 61938 tok/s +step 2852/19560 | loss 3.670044 (-1.06z)| norm 0.2445 (-2.01z)| lr 5.81e-04 | 8458.67 ms | -100.0% bf16 MFU | 61940 tok/s +step 2853/19560 | loss 3.661720 (-1.24z)| norm 0.2519 (-1.65z)| lr 5.81e-04 | 8463.28 ms | -100.0% bf16 MFU | 61940 tok/s +step 2854/19560 | loss 3.779379 (+1.52z)| norm 0.2644 (-1.07z)| lr 5.81e-04 | 8464.07 ms | -100.0% bf16 MFU | 61940 tok/s +step 2855/19560 | loss 3.735757 (+0.50z)| norm 0.3073 (+0.84z)| lr 5.81e-04 | 8463.40 ms | -100.0% bf16 MFU | 61941 tok/s +step 2856/19560 | loss 3.709344 (-0.12z)| norm 0.3481 (+2.58z)| lr 5.81e-04 | 8459.19 ms | -100.0% bf16 MFU | 61943 tok/s +step 2857/19560 | loss 3.687237 (-0.63z)| norm 0.3815 (+3.90z)| lr 5.81e-04 | 8462.91 ms | -100.0% bf16 MFU | 61943 tok/s +step 2858/19560 | loss 3.742079 (+0.67z)| norm 0.3245 (+1.51z)| lr 5.81e-04 | 8461.69 ms | -100.0% bf16 MFU | 61944 tok/s +step 2859/19560 | loss 3.724350 (+0.25z)| norm 0.2817 (-0.30z)| lr 5.81e-04 | 8459.89 ms | -100.0% bf16 MFU | 61945 tok/s +step 2860/19560 | loss 3.705279 (-0.20z)| norm 0.3184 (+1.25z)| lr 5.81e-04 | 8458.79 ms | -100.0% bf16 MFU | 61947 tok/s +step 2861/19560 | loss 3.660646 (-1.23z)| norm 0.2882 (-0.03z)| lr 5.81e-04 | 8457.06 ms | -100.0% bf16 MFU | 61949 tok/s +step 2862/19560 | loss 3.678097 (-0.82z)| norm 0.2750 (-0.59z)| lr 5.81e-04 | 8457.82 ms | -100.0% bf16 MFU | 61951 tok/s +step 2863/19560 | loss 3.740655 (+0.72z)| norm 0.3126 (+1.00z)| lr 5.81e-04 | 8458.46 ms | -100.0% bf16 MFU | 61953 tok/s +step 2864/19560 | loss 3.741897 (+0.75z)| norm 0.3108 (+0.91z)| lr 5.81e-04 | 8457.85 ms | -100.0% bf16 MFU | 61955 tok/s +step 2865/19560 | loss 3.724587 (+0.33z)| norm 0.3233 (+1.41z)| lr 5.81e-04 | 8457.68 ms | -100.0% bf16 MFU | 61957 tok/s +step 2866/19560 | loss 3.639107 (-1.76z)| norm 0.3005 (+0.45z)| lr 5.81e-04 | 8463.22 ms | -100.0% bf16 MFU | 61956 tok/s +step 2867/19560 | loss 3.729412 (+0.47z)| norm 0.3204 (+1.27z)| lr 5.81e-04 | 8462.34 ms | -100.0% bf16 MFU | 61956 tok/s +step 2868/19560 | loss 3.712397 (+0.05z)| norm 0.2842 (-0.27z)| lr 5.81e-04 | 8460.88 ms | -100.0% bf16 MFU | 61957 tok/s +step 2869/19560 | loss 3.705285 (-0.12z)| norm 0.2632 (-1.14z)| lr 5.81e-04 | 8464.58 ms | -100.0% bf16 MFU | 61956 tok/s +step 2870/19560 | loss 3.706785 (-0.08z)| norm 0.2976 (+0.31z)| lr 5.81e-04 | 8462.04 ms | -100.0% bf16 MFU | 61956 tok/s +step 2871/19560 | loss 3.690572 (-0.47z)| norm 0.2664 (-0.99z)| lr 5.81e-04 | 8462.57 ms | -100.0% bf16 MFU | 61956 tok/s +step 2872/19560 | loss 3.696741 (-0.32z)| norm 0.2590 (-1.28z)| lr 5.81e-04 | 8461.93 ms | -100.0% bf16 MFU | 61956 tok/s +step 2873/19560 | loss 3.668282 (-1.01z)| norm 0.2528 (-1.52z)| lr 5.81e-04 | 8463.74 ms | -100.0% bf16 MFU | 61955 tok/s +step 2874/19560 | loss 3.679904 (-0.71z)| norm 0.2761 (-0.55z)| lr 5.81e-04 | 8460.01 ms | -100.0% bf16 MFU | 61956 tok/s +step 2875/19560 | loss 3.664150 (-1.09z)| norm 0.2771 (-0.50z)| lr 5.81e-04 | 8459.91 ms | -100.0% bf16 MFU | 61957 tok/s +step 2876/19560 | loss 3.645744 (-1.52z)| norm 0.2714 (-0.73z)| lr 5.81e-04 | 8460.38 ms | -100.0% bf16 MFU | 61958 tok/s +step 2877/19560 | loss 3.665200 (-1.03z)| norm 0.2935 (+0.18z)| lr 5.81e-04 | 8465.33 ms | -100.0% bf16 MFU | 61956 tok/s +step 2878/19560 | loss 3.736866 (+0.75z)| norm 0.2999 (+0.43z)| lr 5.80e-04 | 8458.23 ms | -100.0% bf16 MFU | 61958 tok/s +step 2879/19560 | loss 3.624073 (-2.00z)| norm 0.3152 (+1.05z)| lr 5.80e-04 | 8460.30 ms | -100.0% bf16 MFU | 61959 tok/s +step 2880/19560 | loss 3.674559 (-0.75z)| norm 0.3195 (+1.20z)| lr 5.80e-04 | 8458.08 ms | -100.0% bf16 MFU | 61960 tok/s +step 2881/19560 | loss 3.676127 (-0.70z)| norm 0.3121 (+0.89z)| lr 5.80e-04 | 8462.03 ms | -100.0% bf16 MFU | 61960 tok/s +step 2882/19560 | loss 3.625884 (-1.89z)| norm 0.2901 (-0.01z)| lr 5.80e-04 | 8458.57 ms | -100.0% bf16 MFU | 61961 tok/s +step 2883/19560 | loss 3.757811 (+1.30z)| norm 0.2689 (-0.89z)| lr 5.80e-04 | 8457.15 ms | -100.0% bf16 MFU | 61963 tok/s +step 2884/19560 | loss 3.673697 (-0.72z)| norm 0.3043 (+0.56z)| lr 5.80e-04 | 8458.64 ms | -100.0% bf16 MFU | 61964 tok/s +step 2885/19560 | loss 3.710011 (+0.16z)| norm 0.2906 (-0.01z)| lr 5.80e-04 | 8457.86 ms | -100.0% bf16 MFU | 61965 tok/s +step 2886/19560 | loss 3.727196 (+0.58z)| norm 0.3042 (+0.54z)| lr 5.80e-04 | 8457.18 ms | -100.0% bf16 MFU | 61966 tok/s +step 2887/19560 | loss 3.714084 (+0.25z)| norm 0.2729 (-0.75z)| lr 5.80e-04 | 8462.87 ms | -100.0% bf16 MFU | 61966 tok/s +step 2888/19560 | loss 3.643360 (-1.45z)| norm 0.2610 (-1.22z)| lr 5.80e-04 | 8459.25 ms | -100.0% bf16 MFU | 61966 tok/s +step 2889/19560 | loss 3.694581 (-0.20z)| norm 0.2606 (-1.22z)| lr 5.80e-04 | 8457.90 ms | -100.0% bf16 MFU | 61967 tok/s +step 2890/19560 | loss 3.697230 (-0.13z)| norm 0.2830 (-0.30z)| lr 5.80e-04 | 8456.17 ms | -100.0% bf16 MFU | 61969 tok/s +step 2891/19560 | loss 3.658943 (-1.05z)| norm 0.2862 (-0.16z)| lr 5.80e-04 | 8458.84 ms | -100.0% bf16 MFU | 61970 tok/s +step 2892/19560 | loss 3.670900 (-0.74z)| norm 0.2726 (-0.72z)| lr 5.80e-04 | 8455.81 ms | -100.0% bf16 MFU | 61971 tok/s +step 2893/19560 | loss 3.638582 (-1.52z)| norm 0.2439 (-1.86z)| lr 5.80e-04 | 8460.88 ms | -100.0% bf16 MFU | 61971 tok/s +step 2894/19560 | loss 3.637403 (-1.52z)| norm 0.2423 (-1.88z)| lr 5.80e-04 | 8459.30 ms | -100.0% bf16 MFU | 61971 tok/s +step 2895/19560 | loss 3.663962 (-0.86z)| norm 0.2621 (-1.07z)| lr 5.80e-04 | 8464.46 ms | -100.0% bf16 MFU | 61970 tok/s +step 2896/19560 | loss 3.746552 (+1.16z)| norm 0.2532 (-1.41z)| lr 5.80e-04 | 8456.62 ms | -100.0% bf16 MFU | 61971 tok/s +step 2897/19560 | loss 3.692653 (-0.14z)| norm 0.2570 (-1.24z)| lr 5.80e-04 | 8459.22 ms | -100.0% bf16 MFU | 61971 tok/s +step 2898/19560 | loss 3.774801 (+1.84z)| norm 0.2811 (-0.28z)| lr 5.80e-04 | 8456.12 ms | -100.0% bf16 MFU | 61973 tok/s +step 2899/19560 | loss 3.732610 (+0.88z)| norm 0.3121 (+0.94z)| lr 5.80e-04 | 8459.41 ms | -100.0% bf16 MFU | 61973 tok/s +step 2900/19560 | loss 3.684322 (-0.35z)| norm 0.3336 (+1.77z)| lr 5.80e-04 | 8458.79 ms | -100.0% bf16 MFU | 61974 tok/s +step 2901/19560 | loss 3.725440 (+0.72z)| norm 0.3358 (+1.81z)| lr 5.80e-04 | 8454.20 ms | -100.0% bf16 MFU | 61976 tok/s +step 2902/19560 | loss 3.701834 (+0.11z)| norm 0.2969 (+0.30z)| lr 5.80e-04 | 8455.30 ms | -100.0% bf16 MFU | 61977 tok/s +step 2903/19560 | loss 3.655442 (-1.08z)| norm 0.3035 (+0.56z)| lr 5.80e-04 | 8460.53 ms | -100.0% bf16 MFU | 61977 tok/s +step 2904/19560 | loss 3.652561 (-1.14z)| norm 0.2828 (-0.24z)| lr 5.80e-04 | 8459.93 ms | -100.0% bf16 MFU | 61977 tok/s +step 2905/19560 | loss 3.689874 (-0.17z)| norm 0.2690 (-0.78z)| lr 5.80e-04 | 8454.29 ms | -100.0% bf16 MFU | 61978 tok/s +step 2906/19560 | loss 3.710166 (+0.35z)| norm 0.2992 (+0.39z)| lr 5.80e-04 | 8457.31 ms | -100.0% bf16 MFU | 61979 tok/s +step 2907/19560 | loss 3.636570 (-1.53z)| norm 0.2886 (-0.02z)| lr 5.80e-04 | 8454.57 ms | -100.0% bf16 MFU | 61981 tok/s +step 2908/19560 | loss 3.667691 (-0.73z)| norm 0.2606 (-1.11z)| lr 5.80e-04 | 8455.37 ms | -100.0% bf16 MFU | 61982 tok/s +step 2909/19560 | loss 3.716544 (+0.53z)| norm 0.2826 (-0.24z)| lr 5.80e-04 | 8458.37 ms | -100.0% bf16 MFU | 61982 tok/s +step 2910/19560 | loss 3.739769 (+1.11z)| norm 0.2888 (-0.01z)| lr 5.80e-04 | 8456.75 ms | -100.0% bf16 MFU | 61983 tok/s +step 2911/19560 | loss 3.683370 (-0.34z)| norm 0.2772 (-0.45z)| lr 5.80e-04 | 8454.39 ms | -100.0% bf16 MFU | 61984 tok/s +step 2912/19560 | loss 3.745981 (+1.25z)| norm 0.2717 (-0.66z)| lr 5.80e-04 | 8460.47 ms | -100.0% bf16 MFU | 61984 tok/s +step 2913/19560 | loss 3.709386 (+0.32z)| norm 0.2794 (-0.35z)| lr 5.80e-04 | 8459.09 ms | -100.0% bf16 MFU | 61983 tok/s +step 2914/19560 | loss 3.693065 (-0.10z)| norm 0.3034 (+0.58z)| lr 5.80e-04 | 8456.54 ms | -100.0% bf16 MFU | 61984 tok/s +step 2915/19560 | loss 3.688361 (-0.22z)| norm 0.2973 (+0.34z)| lr 5.80e-04 | 8462.48 ms | -100.0% bf16 MFU | 61983 tok/s +step 2916/19560 | loss 3.642967 (-1.38z)| norm 0.2995 (+0.43z)| lr 5.80e-04 | 8455.21 ms | -100.0% bf16 MFU | 61984 tok/s +step 2917/19560 | loss 3.639574 (-1.45z)| norm 0.2794 (-0.34z)| lr 5.80e-04 | 8456.96 ms | -100.0% bf16 MFU | 61984 tok/s +step 2918/19560 | loss 3.645723 (-1.28z)| norm 0.2613 (-1.04z)| lr 5.80e-04 | 8458.05 ms | -100.0% bf16 MFU | 61985 tok/s +step 2919/19560 | loss 3.693655 (-0.07z)| norm 0.2579 (-1.17z)| lr 5.80e-04 | 8460.18 ms | -100.0% bf16 MFU | 61984 tok/s +step 2920/19560 | loss 3.719188 (+0.57z)| norm 0.2693 (-0.72z)| lr 5.80e-04 | 8462.57 ms | -100.0% bf16 MFU | 61982 tok/s +step 2921/19560 | loss 3.706471 (+0.25z)| norm 0.2769 (-0.42z)| lr 5.80e-04 | 8458.89 ms | -100.0% bf16 MFU | 61982 tok/s +step 2922/19560 | loss 3.717483 (+0.53z)| norm 0.2584 (-1.14z)| lr 5.80e-04 | 8459.70 ms | -100.0% bf16 MFU | 61982 tok/s +step 2923/19560 | loss 3.673201 (-0.58z)| norm 0.2601 (-1.07z)| lr 5.80e-04 | 8456.34 ms | -100.0% bf16 MFU | 61983 tok/s +step 2924/19560 | loss 3.695934 (+0.01z)| norm 0.2615 (-1.01z)| lr 5.80e-04 | 8455.03 ms | -100.0% bf16 MFU | 61984 tok/s +step 2925/19560 | loss 3.689530 (-0.17z)| norm 0.2626 (-0.96z)| lr 5.80e-04 | 8461.38 ms | -100.0% bf16 MFU | 61983 tok/s +step 2926/19560 | loss 3.654772 (-1.06z)| norm 0.2721 (-0.59z)| lr 5.80e-04 | 8456.50 ms | -100.0% bf16 MFU | 61984 tok/s +step 2927/19560 | loss 3.742102 (+1.17z)| norm 0.2961 (+0.34z)| lr 5.80e-04 | 8459.21 ms | -100.0% bf16 MFU | 61984 tok/s +step 2928/19560 | loss 3.708284 (+0.30z)| norm 0.2873 (+0.01z)| lr 5.80e-04 | 8457.34 ms | -100.0% bf16 MFU | 61984 tok/s +step 2929/19560 | loss 3.702907 (+0.18z)| norm 0.3165 (+1.15z)| lr 5.80e-04 | 8456.87 ms | -100.0% bf16 MFU | 61985 tok/s +step 2930/19560 | loss 3.640056 (-1.47z)| norm 0.3093 (+0.86z)| lr 5.80e-04 | 8453.39 ms | -100.0% bf16 MFU | 61986 tok/s +step 2931/19560 | loss 3.659199 (-0.96z)| norm 0.3174 (+1.17z)| lr 5.80e-04 | 8457.69 ms | -100.0% bf16 MFU | 61987 tok/s +step 2932/19560 | loss 3.651895 (-1.13z)| norm 0.3176 (+1.16z)| lr 5.80e-04 | 8458.36 ms | -100.0% bf16 MFU | 61986 tok/s +step 2933/19560 | loss 3.654467 (-1.05z)| norm 0.3096 (+0.86z)| lr 5.80e-04 | 8457.31 ms | -100.0% bf16 MFU | 61987 tok/s +step 2934/19560 | loss 3.625031 (-1.81z)| norm 0.3107 (+0.93z)| lr 5.79e-04 | 8456.95 ms | -100.0% bf16 MFU | 61987 tok/s +step 2935/19560 | loss 3.632859 (-1.58z)| norm 0.2703 (-0.65z)| lr 5.79e-04 | 8456.05 ms | -100.0% bf16 MFU | 61988 tok/s +step 2936/19560 | loss 3.698538 (+0.16z)| norm 0.2515 (-1.37z)| lr 5.79e-04 | 8457.85 ms | -100.0% bf16 MFU | 61988 tok/s +step 2937/19560 | loss 3.663024 (-0.78z)| norm 0.2792 (-0.29z)| lr 5.79e-04 | 8457.96 ms | -100.0% bf16 MFU | 61988 tok/s +step 2938/19560 | loss 3.733642 (+1.08z)| norm 0.2979 (+0.43z)| lr 5.79e-04 | 8457.29 ms | -100.0% bf16 MFU | 61988 tok/s +step 2939/19560 | loss 3.708197 (+0.40z)| norm 0.2787 (-0.34z)| lr 5.79e-04 | 8456.97 ms | -100.0% bf16 MFU | 61988 tok/s +step 2940/19560 | loss 3.709888 (+0.44z)| norm 0.3215 (+1.34z)| lr 5.79e-04 | 8455.79 ms | -100.0% bf16 MFU | 61989 tok/s +step 2941/19560 | loss 3.661963 (-0.83z)| norm 0.3225 (+1.36z)| lr 5.79e-04 | 8455.91 ms | -100.0% bf16 MFU | 61990 tok/s +step 2942/19560 | loss 3.805475 (+2.85z)| norm 0.2547 (-1.34z)| lr 5.79e-04 | 8453.87 ms | -100.0% bf16 MFU | 61991 tok/s +step 2943/19560 | loss 3.693607 (-0.03z)| norm 0.2785 (-0.39z)| lr 5.79e-04 | 8458.29 ms | -100.0% bf16 MFU | 61991 tok/s +step 2944/19560 | loss 3.674191 (-0.53z)| norm 0.3164 (+1.10z)| lr 5.79e-04 | 8457.85 ms | -100.0% bf16 MFU | 61991 tok/s +step 2945/19560 | loss 3.667813 (-0.68z)| norm 0.3402 (+2.02z)| lr 5.79e-04 | 8457.72 ms | -100.0% bf16 MFU | 61991 tok/s +step 2946/19560 | loss 3.659314 (-0.90z)| norm 0.2953 (+0.23z)| lr 5.79e-04 | 8454.80 ms | -100.0% bf16 MFU | 61992 tok/s +step 2947/19560 | loss 3.710170 (+0.39z)| norm 0.2653 (-0.96z)| lr 5.79e-04 | 8457.40 ms | -100.0% bf16 MFU | 61992 tok/s +step 2948/19560 | loss 3.689102 (-0.15z)| norm 0.2783 (-0.45z)| lr 5.79e-04 | 8456.19 ms | -100.0% bf16 MFU | 61992 tok/s +step 2949/19560 | loss 3.701243 (+0.17z)| norm 0.2777 (-0.47z)| lr 5.79e-04 | 8457.97 ms | -100.0% bf16 MFU | 61992 tok/s +step 2950/19560 | loss 3.693451 (-0.03z)| norm 0.3018 (+0.48z)| lr 5.79e-04 | 8455.04 ms | -100.0% bf16 MFU | 61993 tok/s +step 2951/19560 | loss 3.738814 (+1.14z)| norm 0.2911 (+0.06z)| lr 5.79e-04 | 8457.94 ms | -100.0% bf16 MFU | 61993 tok/s +step 2952/19560 | loss 3.712483 (+0.45z)| norm 0.2537 (-1.41z)| lr 5.79e-04 | 8461.72 ms | -100.0% bf16 MFU | 61991 tok/s +step 2953/19560 | loss 3.639216 (-1.43z)| norm 0.3106 (+0.82z)| lr 5.79e-04 | 8457.85 ms | -100.0% bf16 MFU | 61991 tok/s +step 2954/19560 | loss 3.702236 (+0.19z)| norm 0.3225 (+1.27z)| lr 5.79e-04 | 8457.39 ms | -100.0% bf16 MFU | 61991 tok/s +step 2955/19560 | loss 3.758419 (+1.65z)| norm 0.3022 (+0.47z)| lr 5.79e-04 | 8459.78 ms | -100.0% bf16 MFU | 61990 tok/s +step 2956/19560 | loss 3.642882 (-1.33z)| norm 0.3369 (+1.80z)| lr 5.79e-04 | 8456.82 ms | -100.0% bf16 MFU | 61990 tok/s +step 2957/19560 | loss 3.633212 (-1.56z)| norm 0.2829 (-0.29z)| lr 5.79e-04 | 8456.05 ms | -100.0% bf16 MFU | 61991 tok/s +step 2958/19560 | loss 3.687075 (-0.18z)| norm 0.2611 (-1.12z)| lr 5.79e-04 | 8455.05 ms | -100.0% bf16 MFU | 61992 tok/s +step 2959/19560 | loss 3.645355 (-1.24z)| norm 0.2794 (-0.40z)| lr 5.79e-04 | 8460.30 ms | -100.0% bf16 MFU | 61991 tok/s +step 2960/19560 | loss 3.693693 (+0.00z)| norm 0.2738 (-0.61z)| lr 5.79e-04 | 8453.82 ms | -100.0% bf16 MFU | 61992 tok/s +step 2961/19560 | loss 3.772333 (+1.98z)| norm 0.2848 (-0.16z)| lr 5.79e-04 | 8457.20 ms | -100.0% bf16 MFU | 61992 tok/s +step 2962/19560 | loss 3.643126 (-1.28z)| norm 0.2951 (+0.24z)| lr 5.79e-04 | 8457.83 ms | -100.0% bf16 MFU | 61992 tok/s +step 2963/19560 | loss 3.701455 (+0.22z)| norm 0.2901 (+0.04z)| lr 5.79e-04 | 8456.50 ms | -100.0% bf16 MFU | 61992 tok/s +step 2964/19560 | loss 3.683044 (-0.25z)| norm 0.3164 (+1.09z)| lr 5.79e-04 | 8459.71 ms | -100.0% bf16 MFU | 61991 tok/s +step 2965/19560 | loss 3.665108 (-0.70z)| norm 0.3277 (+1.55z)| lr 5.79e-04 | 8457.60 ms | -100.0% bf16 MFU | 61991 tok/s +step 2966/19560 | loss 3.677825 (-0.36z)| norm 0.2666 (-0.89z)| lr 5.79e-04 | 8460.03 ms | -100.0% bf16 MFU | 61990 tok/s +step 2967/19560 | loss 3.673921 (-0.45z)| norm 0.2912 (+0.11z)| lr 5.79e-04 | 8478.88 ms | -100.0% bf16 MFU | 61983 tok/s +step 2968/19560 | loss 3.732727 (+1.13z)| norm 0.2783 (-0.40z)| lr 5.79e-04 | 8481.64 ms | -100.0% bf16 MFU | 61974 tok/s +step 2969/19560 | loss 3.762474 (+1.89z)| norm 0.2665 (-0.87z)| lr 5.79e-04 | 8480.83 ms | -100.0% bf16 MFU | 61966 tok/s +step 2970/19560 | loss 3.752548 (+1.60z)| norm 0.2938 (+0.23z)| lr 5.79e-04 | 8483.43 ms | -100.0% bf16 MFU | 61958 tok/s +step 2971/19560 | loss 3.724806 (+0.86z)| norm 0.3007 (+0.50z)| lr 5.79e-04 | 8479.93 ms | -100.0% bf16 MFU | 61952 tok/s +step 2972/19560 | loss 3.807968 (+2.93z)| norm 0.2793 (-0.37z)| lr 5.79e-04 | 8478.47 ms | -100.0% bf16 MFU | 61946 tok/s +step 2973/19560 | loss 3.686601 (-0.16z)| norm 0.2955 (+0.29z)| lr 5.79e-04 | 8477.16 ms | -100.0% bf16 MFU | 61941 tok/s +step 2974/19560 | loss 3.687787 (-0.12z)| norm 0.3002 (+0.48z)| lr 5.79e-04 | 8477.12 ms | -100.0% bf16 MFU | 61936 tok/s +step 2975/19560 | loss 3.736172 (+1.10z)| norm 0.2871 (-0.07z)| lr 5.79e-04 | 8477.54 ms | -100.0% bf16 MFU | 61932 tok/s +step 2976/19560 | loss 3.633856 (-1.52z)| norm 0.3086 (+0.81z)| lr 5.79e-04 | 8473.96 ms | -100.0% bf16 MFU | 61929 tok/s +step 2977/19560 | loss 3.674346 (-0.48z)| norm 0.2971 (+0.34z)| lr 5.79e-04 | 8479.74 ms | -100.0% bf16 MFU | 61924 tok/s +step 2978/19560 | loss 3.795920 (+2.57z)| norm 0.2740 (-0.60z)| lr 5.79e-04 | 8473.30 ms | -100.0% bf16 MFU | 61921 tok/s +step 2979/19560 | loss 3.675896 (-0.44z)| norm 0.2734 (-0.62z)| lr 5.79e-04 | 8474.60 ms | -100.0% bf16 MFU | 61918 tok/s +step 2980/19560 | loss 3.692677 (-0.02z)| norm 0.2538 (-1.44z)| lr 5.79e-04 | 8469.78 ms | -100.0% bf16 MFU | 61918 tok/s +step 2981/19560 | loss 3.675824 (-0.45z)| norm 0.2693 (-0.81z)| lr 5.79e-04 | 8477.52 ms | -100.0% bf16 MFU | 61914 tok/s +step 2982/19560 | loss 3.721530 (+0.73z)| norm 0.2473 (-1.70z)| lr 5.79e-04 | 8476.99 ms | -100.0% bf16 MFU | 61911 tok/s +step 2983/19560 | loss 3.768728 (+1.91z)| norm 0.2610 (-1.12z)| lr 5.79e-04 | 8473.55 ms | -100.0% bf16 MFU | 61909 tok/s +step 2984/19560 | loss 3.713116 (+0.50z)| norm 0.2539 (-1.40z)| lr 5.79e-04 | 8470.52 ms | -100.0% bf16 MFU | 61908 tok/s +step 2985/19560 | loss 3.674684 (-0.47z)| norm 0.2522 (-1.51z)| lr 5.79e-04 | 8476.77 ms | -100.0% bf16 MFU | 61905 tok/s +step 2986/19560 | loss 3.744649 (+1.30z)| norm 0.2705 (-0.70z)| lr 5.79e-04 | 8469.24 ms | -100.0% bf16 MFU | 61905 tok/s +step 2987/19560 | loss 3.724160 (+0.78z)| norm 0.2884 (+0.09z)| lr 5.79e-04 | 8471.81 ms | -100.0% bf16 MFU | 61904 tok/s +step 2988/19560 | loss 3.700850 (+0.19z)| norm 0.2995 (+0.60z)| lr 5.78e-04 | 8471.77 ms | -100.0% bf16 MFU | 61903 tok/s +step 2989/19560 | loss 3.714831 (+0.53z)| norm 0.2665 (-0.87z)| lr 5.78e-04 | 8469.72 ms | -100.0% bf16 MFU | 61903 tok/s +step 2990/19560 | loss 3.770655 (+1.90z)| norm 0.2819 (-0.18z)| lr 5.78e-04 | 8471.37 ms | -100.0% bf16 MFU | 61903 tok/s +step 2991/19560 | loss 3.707169 (+0.33z)| norm 0.3041 (+0.81z)| lr 5.78e-04 | 8476.06 ms | -100.0% bf16 MFU | 61900 tok/s +step 2992/19560 | loss 3.732004 (+0.95z)| norm 0.3488 (+2.72z)| lr 5.78e-04 | 8467.98 ms | -100.0% bf16 MFU | 61901 tok/s +step 2993/19560 | loss 3.677935 (-0.40z)| norm 0.3346 (+2.09z)| lr 5.78e-04 | 8467.51 ms | -100.0% bf16 MFU | 61902 tok/s +step 2994/19560 | loss 3.734853 (+1.02z)| norm 0.2819 (-0.19z)| lr 5.78e-04 | 8468.19 ms | -100.0% bf16 MFU | 61902 tok/s +step 2995/19560 | loss 3.732944 (+0.97z)| norm 0.2694 (-0.71z)| lr 5.78e-04 | 8467.83 ms | -100.0% bf16 MFU | 61903 tok/s +step 2996/19560 | loss 3.705207 (+0.27z)| norm 0.3055 (+0.85z)| lr 5.78e-04 | 8475.49 ms | -100.0% bf16 MFU | 61901 tok/s +step 2997/19560 | loss 3.729326 (+0.87z)| norm 0.2735 (-0.55z)| lr 5.78e-04 | 8473.66 ms | -100.0% bf16 MFU | 61899 tok/s +step 2998/19560 | loss 3.684263 (-0.26z)| norm 0.2583 (-1.19z)| lr 5.78e-04 | 8469.61 ms | -100.0% bf16 MFU | 61899 tok/s +step 2999/19560 | loss 3.691717 (-0.07z)| norm 0.2886 (+0.12z)| lr 5.78e-04 | 8474.45 ms | -100.0% bf16 MFU | 61898 tok/s +step 3000/19560 | loss 3.782339 (+2.16z)| norm 0.2685 (-0.76z)| lr 5.78e-04 | 8467.98 ms | -100.0% bf16 MFU | 61899 tok/s +val loss 3.693667 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2655/10042 = 0.264390 +step 3001/19560 | loss 3.739810 (+1.09z)| norm 0.2752 (-0.48z)| lr 5.78e-04 | 8465.03 ms | -100.0% bf16 MFU | 61901 tok/s +step 3002/19560 | loss 3.699944 (+0.10z)| norm 0.2682 (-0.78z)| lr 5.78e-04 | 8455.38 ms | -100.0% bf16 MFU | 61906 tok/s +step 3003/19560 | loss 3.705355 (+0.23z)| norm 0.2626 (-1.02z)| lr 5.78e-04 | 8458.91 ms | -100.0% bf16 MFU | 61910 tok/s +step 3004/19560 | loss 3.662326 (-0.84z)| norm 0.2939 (+0.34z)| lr 5.78e-04 | 8446.73 ms | -100.0% bf16 MFU | 61918 tok/s +step 3005/19560 | loss 3.688404 (-0.20z)| norm 0.2613 (-1.07z)| lr 5.78e-04 | 8451.53 ms | -100.0% bf16 MFU | 61923 tok/s +step 3006/19560 | loss 3.728819 (+0.81z)| norm 0.2829 (-0.12z)| lr 5.78e-04 | 8450.51 ms | -100.0% bf16 MFU | 61929 tok/s +step 3007/19560 | loss 3.637306 (-1.48z)| norm 0.3125 (+1.17z)| lr 5.78e-04 | 8448.04 ms | -100.0% bf16 MFU | 61936 tok/s +step 3008/19560 | loss 3.668650 (-0.70z)| norm 0.2842 (-0.06z)| lr 5.78e-04 | 8451.03 ms | -100.0% bf16 MFU | 61941 tok/s +step 3009/19560 | loss 3.689366 (-0.18z)| norm 0.2513 (-1.48z)| lr 5.78e-04 | 8453.44 ms | -100.0% bf16 MFU | 61945 tok/s +step 3010/19560 | loss 3.704054 (+0.17z)| norm 0.2713 (-0.59z)| lr 5.78e-04 | 8442.59 ms | -100.0% bf16 MFU | 61953 tok/s +step 3011/19560 | loss 3.699582 (+0.07z)| norm 0.3031 (+0.79z)| lr 5.78e-04 | 8448.50 ms | -100.0% bf16 MFU | 61958 tok/s +step 3012/19560 | loss 3.776022 (+1.98z)| norm 0.2869 (+0.09z)| lr 5.78e-04 | 8442.51 ms | -100.0% bf16 MFU | 61965 tok/s +step 3013/19560 | loss 3.639647 (-1.44z)| norm 0.3693 (+3.50z)| lr 5.78e-04 | 8449.98 ms | -100.0% bf16 MFU | 61969 tok/s +step 3014/19560 | loss 3.769697 (+1.79z)| norm 0.3852 (+3.89z)| lr 5.78e-04 | 8439.34 ms | -100.0% bf16 MFU | 61977 tok/s +step 3015/19560 | loss 3.739720 (+1.04z)| norm 0.3858 (+3.67z)| lr 5.78e-04 | 8443.97 ms | -100.0% bf16 MFU | 61983 tok/s +step 3016/19560 | loss 3.692528 (-0.13z)| norm 0.3297 (+1.55z)| lr 5.78e-04 | 8449.78 ms | -100.0% bf16 MFU | 61986 tok/s +step 3017/19560 | loss 3.709663 (+0.29z)| norm 0.3117 (+0.88z)| lr 5.78e-04 | 8450.17 ms | -100.0% bf16 MFU | 61989 tok/s +step 3018/19560 | loss 3.721475 (+0.58z)| norm 0.2785 (-0.35z)| lr 5.78e-04 | 8450.03 ms | -100.0% bf16 MFU | 61992 tok/s +step 3019/19560 | loss 3.787722 (+2.16z)| norm 0.2593 (-1.05z)| lr 5.78e-04 | 8445.80 ms | -100.0% bf16 MFU | 61996 tok/s +step 3020/19560 | loss 3.684504 (-0.36z)| norm 0.2563 (-1.15z)| lr 5.78e-04 | 8453.05 ms | -100.0% bf16 MFU | 61997 tok/s +step 3021/19560 | loss 3.697668 (-0.05z)| norm 0.2521 (-1.31z)| lr 5.78e-04 | 8450.56 ms | -100.0% bf16 MFU | 61999 tok/s +step 3022/19560 | loss 3.679876 (-0.50z)| norm 0.2603 (-1.02z)| lr 5.78e-04 | 8449.51 ms | -100.0% bf16 MFU | 62002 tok/s +step 3023/19560 | loss 3.731028 (+0.76z)| norm 0.2929 (+0.18z)| lr 5.78e-04 | 8449.25 ms | -100.0% bf16 MFU | 62004 tok/s +step 3024/19560 | loss 3.737624 (+0.92z)| norm 0.2850 (-0.12z)| lr 5.78e-04 | 8451.79 ms | -100.0% bf16 MFU | 62006 tok/s +step 3025/19560 | loss 3.695929 (-0.12z)| norm 0.2634 (-0.94z)| lr 5.78e-04 | 8443.34 ms | -100.0% bf16 MFU | 62010 tok/s +step 3026/19560 | loss 3.706790 (+0.17z)| norm 0.2468 (-1.54z)| lr 5.78e-04 | 8443.94 ms | -100.0% bf16 MFU | 62014 tok/s +step 3027/19560 | loss 3.676703 (-0.58z)| norm 0.2543 (-1.24z)| lr 5.78e-04 | 8445.22 ms | -100.0% bf16 MFU | 62018 tok/s +step 3028/19560 | loss 3.741261 (+1.04z)| norm 0.2634 (-0.89z)| lr 5.78e-04 | 8444.92 ms | -100.0% bf16 MFU | 62021 tok/s +step 3029/19560 | loss 3.840692 (+3.37z)| norm 0.2563 (-1.14z)| lr 5.78e-04 | 8444.62 ms | -100.0% bf16 MFU | 62024 tok/s +step 3030/19560 | loss 3.747816 (+1.12z)| norm 0.2319 (-2.01z)| lr 5.78e-04 | 8442.93 ms | -100.0% bf16 MFU | 62028 tok/s +step 3031/19560 | loss 3.669407 (-0.77z)| norm 0.2306 (-2.01z)| lr 5.78e-04 | 8443.62 ms | -100.0% bf16 MFU | 62031 tok/s +step 3032/19560 | loss 3.737210 (+0.85z)| norm 0.2640 (-0.78z)| lr 5.78e-04 | 8446.34 ms | -100.0% bf16 MFU | 62033 tok/s +step 3033/19560 | loss 3.705574 (+0.08z)| norm 0.2656 (-0.72z)| lr 5.78e-04 | 8440.02 ms | -100.0% bf16 MFU | 62037 tok/s +step 3034/19560 | loss 3.672260 (-0.72z)| norm 0.2850 (-0.01z)| lr 5.78e-04 | 8439.66 ms | -100.0% bf16 MFU | 62042 tok/s +step 3035/19560 | loss 3.755983 (+1.28z)| norm 0.2983 (+0.48z)| lr 5.78e-04 | 8440.05 ms | -100.0% bf16 MFU | 62046 tok/s +step 3036/19560 | loss 3.675757 (-0.66z)| norm 0.3066 (+0.77z)| lr 5.78e-04 | 8442.85 ms | -100.0% bf16 MFU | 62048 tok/s +step 3037/19560 | loss 3.729861 (+0.65z)| norm 0.3494 (+2.26z)| lr 5.78e-04 | 8437.61 ms | -100.0% bf16 MFU | 62053 tok/s +step 3038/19560 | loss 3.714893 (+0.29z)| norm 0.3359 (+1.75z)| lr 5.78e-04 | 8438.77 ms | -100.0% bf16 MFU | 62056 tok/s +step 3039/19560 | loss 3.723227 (+0.49z)| norm 0.3171 (+1.07z)| lr 5.78e-04 | 8442.14 ms | -100.0% bf16 MFU | 62059 tok/s +step 3040/19560 | loss 3.716887 (+0.34z)| norm 0.2796 (-0.26z)| lr 5.78e-04 | 8442.25 ms | -100.0% bf16 MFU | 62061 tok/s +step 3041/19560 | loss 3.724478 (+0.52z)| norm 0.3056 (+0.65z)| lr 5.77e-04 | 8440.78 ms | -100.0% bf16 MFU | 62064 tok/s +step 3042/19560 | loss 3.694926 (-0.20z)| norm 0.3225 (+1.24z)| lr 5.77e-04 | 8439.83 ms | -100.0% bf16 MFU | 62066 tok/s +step 3043/19560 | loss 3.696680 (-0.16z)| norm 0.2929 (+0.20z)| lr 5.77e-04 | 8442.06 ms | -100.0% bf16 MFU | 62068 tok/s +step 3044/19560 | loss 3.694305 (-0.23z)| norm 0.2862 (-0.03z)| lr 5.77e-04 | 8442.48 ms | -100.0% bf16 MFU | 62070 tok/s +step 3045/19560 | loss 3.683255 (-0.51z)| norm 0.2836 (-0.12z)| lr 5.77e-04 | 8442.88 ms | -100.0% bf16 MFU | 62071 tok/s +step 3046/19560 | loss 3.696014 (-0.21z)| norm 0.2871 (-0.01z)| lr 5.77e-04 | 8442.33 ms | -100.0% bf16 MFU | 62073 tok/s +step 3047/19560 | loss 3.729094 (+0.61z)| norm 0.2506 (-1.29z)| lr 5.77e-04 | 8438.80 ms | -100.0% bf16 MFU | 62076 tok/s +step 3048/19560 | loss 3.761278 (+1.39z)| norm 0.3315 (+1.52z)| lr 5.77e-04 | 8445.52 ms | -100.0% bf16 MFU | 62076 tok/s +step 3049/19560 | loss 3.692324 (-0.31z)| norm 0.3358 (+1.64z)| lr 5.77e-04 | 8437.62 ms | -100.0% bf16 MFU | 62079 tok/s +step 3050/19560 | loss 3.672176 (-0.80z)| norm 0.3284 (+1.36z)| lr 5.77e-04 | 8444.10 ms | -100.0% bf16 MFU | 62079 tok/s +step 3051/19560 | loss 3.647605 (-1.39z)| norm 0.3078 (+0.64z)| lr 5.77e-04 | 8450.30 ms | -100.0% bf16 MFU | 62078 tok/s +step 3052/19560 | loss 3.752791 (+1.17z)| norm 0.2966 (+0.25z)| lr 5.77e-04 | 8439.90 ms | -100.0% bf16 MFU | 62080 tok/s +step 3053/19560 | loss 3.668043 (-0.89z)| norm 0.2978 (+0.28z)| lr 5.77e-04 | 8444.83 ms | -100.0% bf16 MFU | 62080 tok/s +step 3054/19560 | loss 3.720034 (+0.37z)| norm 0.2813 (-0.29z)| lr 5.77e-04 | 8443.35 ms | -100.0% bf16 MFU | 62081 tok/s +step 3055/19560 | loss 3.761964 (+1.38z)| norm 0.2507 (-1.33z)| lr 5.77e-04 | 8448.73 ms | -100.0% bf16 MFU | 62079 tok/s +step 3056/19560 | loss 3.690093 (-0.36z)| norm 0.2722 (-0.59z)| lr 5.77e-04 | 8448.11 ms | -100.0% bf16 MFU | 62078 tok/s +step 3057/19560 | loss 3.767934 (+1.51z)| norm 0.2582 (-1.05z)| lr 5.77e-04 | 8445.99 ms | -100.0% bf16 MFU | 62078 tok/s +step 3058/19560 | loss 3.731013 (+0.60z)| norm 0.2619 (-0.91z)| lr 5.77e-04 | 8440.76 ms | -100.0% bf16 MFU | 62080 tok/s +step 3059/19560 | loss 3.719428 (+0.31z)| norm 0.2728 (-0.53z)| lr 5.77e-04 | 8449.26 ms | -100.0% bf16 MFU | 62079 tok/s +step 3060/19560 | loss 3.703955 (-0.08z)| norm 0.2882 (+0.01z)| lr 5.77e-04 | 8448.08 ms | -100.0% bf16 MFU | 62078 tok/s +step 3061/19560 | loss 3.702940 (-0.11z)| norm 0.3018 (+0.48z)| lr 5.77e-04 | 8448.66 ms | -100.0% bf16 MFU | 62077 tok/s +step 3062/19560 | loss 3.683926 (-0.61z)| norm 0.2838 (-0.13z)| lr 5.77e-04 | 8457.44 ms | -100.0% bf16 MFU | 62072 tok/s +step 3063/19560 | loss 3.733791 (+0.64z)| norm 0.2765 (-0.39z)| lr 5.77e-04 | 8455.82 ms | -100.0% bf16 MFU | 62069 tok/s +step 3064/19560 | loss 3.678607 (-0.76z)| norm 0.3066 (+0.65z)| lr 5.77e-04 | 8452.99 ms | -100.0% bf16 MFU | 62067 tok/s +step 3065/19560 | loss 3.681403 (-0.70z)| norm 0.2819 (-0.22z)| lr 5.77e-04 | 8450.23 ms | -100.0% bf16 MFU | 62066 tok/s +step 3066/19560 | loss 3.773503 (+1.63z)| norm 0.2738 (-0.49z)| lr 5.77e-04 | 8450.29 ms | -100.0% bf16 MFU | 62064 tok/s +step 3067/19560 | loss 3.670304 (-0.97z)| norm 0.2993 (+0.39z)| lr 5.77e-04 | 8451.84 ms | -100.0% bf16 MFU | 62063 tok/s +step 3068/19560 | loss 3.748222 (+0.98z)| norm 0.3170 (+1.01z)| lr 5.77e-04 | 8448.64 ms | -100.0% bf16 MFU | 62062 tok/s +step 3069/19560 | loss 3.747607 (+0.95z)| norm 0.2913 (+0.12z)| lr 5.77e-04 | 8448.38 ms | -100.0% bf16 MFU | 62062 tok/s +step 3070/19560 | loss 3.724631 (+0.40z)| norm 0.2826 (-0.19z)| lr 5.77e-04 | 8451.19 ms | -100.0% bf16 MFU | 62061 tok/s +step 3071/19560 | loss 3.684502 (-0.63z)| norm 0.2920 (+0.14z)| lr 5.77e-04 | 8457.46 ms | -100.0% bf16 MFU | 62058 tok/s +step 3072/19560 | loss 3.799605 (+2.27z)| norm 0.2715 (-0.58z)| lr 5.77e-04 | 8450.07 ms | -100.0% bf16 MFU | 62057 tok/s +step 3073/19560 | loss 3.683007 (-0.69z)| norm 0.2947 (+0.26z)| lr 5.77e-04 | 8450.69 ms | -100.0% bf16 MFU | 62056 tok/s +step 3074/19560 | loss 3.734826 (+0.61z)| norm 0.2890 (+0.06z)| lr 5.77e-04 | 8450.06 ms | -100.0% bf16 MFU | 62056 tok/s +step 3075/19560 | loss 3.736997 (+0.66z)| norm 0.2995 (+0.43z)| lr 5.77e-04 | 8457.74 ms | -100.0% bf16 MFU | 62052 tok/s +step 3076/19560 | loss 3.745228 (+0.86z)| norm 0.2966 (+0.32z)| lr 5.77e-04 | 8456.53 ms | -100.0% bf16 MFU | 62050 tok/s +step 3077/19560 | loss 3.678975 (-0.82z)| norm 0.2977 (+0.35z)| lr 5.77e-04 | 8456.45 ms | -100.0% bf16 MFU | 62047 tok/s +step 3078/19560 | loss 3.681756 (-0.74z)| norm 0.2932 (+0.19z)| lr 5.77e-04 | 8451.10 ms | -100.0% bf16 MFU | 62047 tok/s +step 3079/19560 | loss 3.707478 (-0.09z)| norm 0.2837 (-0.15z)| lr 5.77e-04 | 8452.14 ms | -100.0% bf16 MFU | 62046 tok/s +step 3080/19560 | loss 3.708147 (-0.07z)| norm 0.2898 (+0.06z)| lr 5.77e-04 | 8455.14 ms | -100.0% bf16 MFU | 62044 tok/s +step 3081/19560 | loss 3.730906 (+0.50z)| norm 0.3103 (+0.80z)| lr 5.77e-04 | 8450.77 ms | -100.0% bf16 MFU | 62044 tok/s +step 3082/19560 | loss 3.794769 (+2.08z)| norm 0.2934 (+0.20z)| lr 5.77e-04 | 8452.55 ms | -100.0% bf16 MFU | 62043 tok/s +step 3083/19560 | loss 3.734059 (+0.56z)| norm 0.2913 (+0.13z)| lr 5.77e-04 | 8448.66 ms | -100.0% bf16 MFU | 62043 tok/s +step 3084/19560 | loss 3.711416 (-0.03z)| norm 0.3407 (+1.93z)| lr 5.77e-04 | 8453.04 ms | -100.0% bf16 MFU | 62042 tok/s +step 3085/19560 | loss 3.711262 (-0.05z)| norm 0.2864 (-0.05z)| lr 5.77e-04 | 8453.96 ms | -100.0% bf16 MFU | 62041 tok/s +step 3086/19560 | loss 3.674640 (-1.00z)| norm 0.3117 (+0.86z)| lr 5.77e-04 | 8452.55 ms | -100.0% bf16 MFU | 62040 tok/s +step 3087/19560 | loss 3.738398 (+0.65z)| norm 0.3022 (+0.50z)| lr 5.77e-04 | 8462.82 ms | -100.0% bf16 MFU | 62036 tok/s +step 3088/19560 | loss 3.690283 (-0.62z)| norm 0.3047 (+0.58z)| lr 5.77e-04 | 8451.87 ms | -100.0% bf16 MFU | 62036 tok/s +step 3089/19560 | loss 3.726182 (+0.34z)| norm 0.2727 (-0.58z)| lr 5.77e-04 | 8456.85 ms | -100.0% bf16 MFU | 62034 tok/s +step 3090/19560 | loss 3.726396 (+0.33z)| norm 0.2603 (-1.01z)| lr 5.77e-04 | 8456.31 ms | -100.0% bf16 MFU | 62032 tok/s +step 3091/19560 | loss 3.642267 (-1.89z)| norm 0.2700 (-0.65z)| lr 5.77e-04 | 8455.13 ms | -100.0% bf16 MFU | 62031 tok/s +step 3092/19560 | loss 3.710513 (-0.09z)| norm 0.2484 (-1.41z)| lr 5.77e-04 | 8456.88 ms | -100.0% bf16 MFU | 62029 tok/s +step 3093/19560 | loss 3.683437 (-0.82z)| norm 0.2691 (-0.65z)| lr 5.76e-04 | 8456.65 ms | -100.0% bf16 MFU | 62028 tok/s +step 3094/19560 | loss 3.692679 (-0.57z)| norm 0.2514 (-1.29z)| lr 5.76e-04 | 8455.46 ms | -100.0% bf16 MFU | 62026 tok/s +step 3095/19560 | loss 3.664860 (-1.31z)| norm 0.2527 (-1.22z)| lr 5.76e-04 | 8454.59 ms | -100.0% bf16 MFU | 62026 tok/s +step 3096/19560 | loss 3.698787 (-0.40z)| norm 0.2461 (-1.44z)| lr 5.76e-04 | 8457.25 ms | -100.0% bf16 MFU | 62024 tok/s +step 3097/19560 | loss 3.746549 (+0.88z)| norm 0.2554 (-1.10z)| lr 5.76e-04 | 8458.16 ms | -100.0% bf16 MFU | 62022 tok/s +step 3098/19560 | loss 3.636946 (-2.01z)| norm 0.2563 (-1.05z)| lr 5.76e-04 | 8457.92 ms | -100.0% bf16 MFU | 62020 tok/s +step 3099/19560 | loss 3.725782 (+0.34z)| norm 0.2666 (-0.68z)| lr 5.76e-04 | 8454.03 ms | -100.0% bf16 MFU | 62020 tok/s +step 3100/19560 | loss 3.736435 (+0.66z)| norm 0.2352 (-1.76z)| lr 5.76e-04 | 8454.39 ms | -100.0% bf16 MFU | 62020 tok/s +step 3101/19560 | loss 3.703364 (-0.24z)| norm 0.2405 (-1.54z)| lr 5.76e-04 | 8462.56 ms | -100.0% bf16 MFU | 62017 tok/s +step 3102/19560 | loss 3.692678 (-0.53z)| norm 0.2775 (-0.25z)| lr 5.76e-04 | 8457.57 ms | -100.0% bf16 MFU | 62015 tok/s +step 3103/19560 | loss 3.685419 (-0.72z)| norm 0.3172 (+1.11z)| lr 5.76e-04 | 8451.72 ms | -100.0% bf16 MFU | 62016 tok/s +step 3104/19560 | loss 3.688178 (-0.67z)| norm 0.3297 (+1.52z)| lr 5.76e-04 | 8455.78 ms | -100.0% bf16 MFU | 62016 tok/s +step 3105/19560 | loss 3.711130 (-0.04z)| norm 0.2897 (+0.15z)| lr 5.76e-04 | 8456.95 ms | -100.0% bf16 MFU | 62015 tok/s +step 3106/19560 | loss 3.708053 (-0.11z)| norm 0.2564 (-0.98z)| lr 5.76e-04 | 8458.25 ms | -100.0% bf16 MFU | 62013 tok/s +step 3107/19560 | loss 3.695639 (-0.47z)| norm 0.2666 (-0.63z)| lr 5.76e-04 | 8456.44 ms | -100.0% bf16 MFU | 62012 tok/s +step 3108/19560 | loss 3.697978 (-0.40z)| norm 0.2670 (-0.62z)| lr 5.76e-04 | 8457.88 ms | -100.0% bf16 MFU | 62011 tok/s +step 3109/19560 | loss 3.681914 (-0.86z)| norm 0.2803 (-0.17z)| lr 5.76e-04 | 8456.63 ms | -100.0% bf16 MFU | 62010 tok/s +step 3110/19560 | loss 3.732315 (+0.57z)| norm 0.2988 (+0.45z)| lr 5.76e-04 | 8451.71 ms | -100.0% bf16 MFU | 62012 tok/s +step 3111/19560 | loss 3.704248 (-0.22z)| norm 0.3024 (+0.57z)| lr 5.76e-04 | 8458.63 ms | -100.0% bf16 MFU | 62010 tok/s +step 3112/19560 | loss 3.682156 (-0.84z)| norm 0.2912 (+0.17z)| lr 5.76e-04 | 8456.79 ms | -100.0% bf16 MFU | 62009 tok/s +step 3113/19560 | loss 3.685569 (-0.75z)| norm 0.2875 (+0.04z)| lr 5.76e-04 | 8457.38 ms | -100.0% bf16 MFU | 62009 tok/s +step 3114/19560 | loss 3.675803 (-1.01z)| norm 0.2993 (+0.44z)| lr 5.76e-04 | 8453.19 ms | -100.0% bf16 MFU | 62009 tok/s +step 3115/19560 | loss 3.764115 (+1.50z)| norm 0.2771 (-0.33z)| lr 5.76e-04 | 8451.94 ms | -100.0% bf16 MFU | 62010 tok/s +step 3116/19560 | loss 3.723698 (+0.34z)| norm 0.2703 (-0.56z)| lr 5.76e-04 | 8456.65 ms | -100.0% bf16 MFU | 62010 tok/s +step 3117/19560 | loss 3.730096 (+0.52z)| norm 0.2836 (-0.10z)| lr 5.76e-04 | 8456.57 ms | -100.0% bf16 MFU | 62009 tok/s +step 3118/19560 | loss 3.688233 (-0.66z)| norm 0.2788 (-0.27z)| lr 5.76e-04 | 8452.94 ms | -100.0% bf16 MFU | 62010 tok/s +step 3119/19560 | loss 3.688808 (-0.63z)| norm 0.2501 (-1.25z)| lr 5.76e-04 | 8459.60 ms | -100.0% bf16 MFU | 62008 tok/s +step 3120/19560 | loss 3.673146 (-1.07z)| norm 0.2681 (-0.62z)| lr 5.76e-04 | 8453.06 ms | -100.0% bf16 MFU | 62009 tok/s +step 3121/19560 | loss 3.679275 (-0.89z)| norm 0.2507 (-1.21z)| lr 5.76e-04 | 8451.29 ms | -100.0% bf16 MFU | 62010 tok/s +step 3122/19560 | loss 3.725944 (+0.44z)| norm 0.2564 (-1.00z)| lr 5.76e-04 | 8456.54 ms | -100.0% bf16 MFU | 62010 tok/s +step 3123/19560 | loss 3.769196 (+1.66z)| norm 0.2884 (+0.13z)| lr 5.76e-04 | 8455.12 ms | -100.0% bf16 MFU | 62010 tok/s +step 3124/19560 | loss 3.759074 (+1.35z)| norm 0.2632 (-0.75z)| lr 5.76e-04 | 8454.69 ms | -100.0% bf16 MFU | 62010 tok/s +step 3125/19560 | loss 3.687290 (-0.66z)| norm 0.2669 (-0.62z)| lr 5.76e-04 | 8455.49 ms | -100.0% bf16 MFU | 62010 tok/s +step 3126/19560 | loss 3.698977 (-0.34z)| norm 0.2989 (+0.51z)| lr 5.76e-04 | 8455.93 ms | -100.0% bf16 MFU | 62009 tok/s +step 3127/19560 | loss 3.690947 (-0.56z)| norm 0.3218 (+1.30z)| lr 5.76e-04 | 8456.41 ms | -100.0% bf16 MFU | 62009 tok/s +step 3128/19560 | loss 3.705922 (-0.13z)| norm 0.3433 (+2.01z)| lr 5.76e-04 | 8452.34 ms | -100.0% bf16 MFU | 62010 tok/s +step 3129/19560 | loss 3.706984 (-0.09z)| norm 0.3324 (+1.60z)| lr 5.76e-04 | 8454.77 ms | -100.0% bf16 MFU | 62010 tok/s +step 3130/19560 | loss 3.682717 (-0.78z)| norm 0.2791 (-0.24z)| lr 5.76e-04 | 8455.93 ms | -100.0% bf16 MFU | 62009 tok/s +step 3131/19560 | loss 3.687623 (-0.63z)| norm 0.2535 (-1.12z)| lr 5.76e-04 | 8456.63 ms | -100.0% bf16 MFU | 62009 tok/s +step 3132/19560 | loss 3.672683 (-1.07z)| norm 0.2831 (-0.10z)| lr 5.76e-04 | 8453.69 ms | -100.0% bf16 MFU | 62009 tok/s +step 3133/19560 | loss 3.697669 (-0.35z)| norm 0.3048 (+0.64z)| lr 5.76e-04 | 8455.99 ms | -100.0% bf16 MFU | 62009 tok/s +step 3134/19560 | loss 3.634934 (-2.10z)| norm 0.2728 (-0.46z)| lr 5.76e-04 | 8454.39 ms | -100.0% bf16 MFU | 62009 tok/s +step 3135/19560 | loss 3.675590 (-0.97z)| norm 0.2563 (-1.01z)| lr 5.76e-04 | 8452.76 ms | -100.0% bf16 MFU | 62010 tok/s +step 3136/19560 | loss 3.694835 (-0.43z)| norm 0.2563 (-1.00z)| lr 5.76e-04 | 8454.56 ms | -100.0% bf16 MFU | 62010 tok/s +step 3137/19560 | loss 3.654800 (-1.56z)| norm 0.2664 (-0.66z)| lr 5.76e-04 | 8454.07 ms | -100.0% bf16 MFU | 62010 tok/s +step 3138/19560 | loss 3.691850 (-0.50z)| norm 0.2733 (-0.42z)| lr 5.76e-04 | 8452.34 ms | -100.0% bf16 MFU | 62011 tok/s +step 3139/19560 | loss 3.692520 (-0.48z)| norm 0.2917 (+0.21z)| lr 5.76e-04 | 8456.59 ms | -100.0% bf16 MFU | 62011 tok/s +step 3140/19560 | loss 3.697013 (-0.34z)| norm 0.2620 (-0.80z)| lr 5.76e-04 | 8458.35 ms | -100.0% bf16 MFU | 62009 tok/s +step 3141/19560 | loss 3.678610 (-0.89z)| norm 0.2914 (+0.24z)| lr 5.76e-04 | 8455.98 ms | -100.0% bf16 MFU | 62009 tok/s +step 3142/19560 | loss 3.672124 (-1.06z)| norm 0.3283 (+1.63z)| lr 5.76e-04 | 8458.26 ms | -100.0% bf16 MFU | 62008 tok/s +step 3143/19560 | loss 3.684347 (-0.69z)| norm 0.3047 (+0.83z)| lr 5.76e-04 | 8456.98 ms | -100.0% bf16 MFU | 62007 tok/s +step 3144/19560 | loss 3.728930 (+0.62z)| norm 0.2772 (-0.24z)| lr 5.76e-04 | 8451.11 ms | -100.0% bf16 MFU | 62009 tok/s +step 3145/19560 | loss 3.710640 (+0.08z)| norm 0.2611 (-0.87z)| lr 5.75e-04 | 8454.44 ms | -100.0% bf16 MFU | 62009 tok/s +step 3146/19560 | loss 3.673427 (-1.01z)| norm 0.2727 (-0.40z)| lr 5.75e-04 | 8450.11 ms | -100.0% bf16 MFU | 62011 tok/s +step 3147/19560 | loss 3.661708 (-1.34z)| norm 0.2700 (-0.51z)| lr 5.75e-04 | 8454.62 ms | -100.0% bf16 MFU | 62011 tok/s +step 3148/19560 | loss 3.722538 (+0.46z)| norm 0.2868 (+0.15z)| lr 5.75e-04 | 8454.45 ms | -100.0% bf16 MFU | 62011 tok/s +step 3149/19560 | loss 3.663094 (-1.30z)| norm 0.2964 (+0.52z)| lr 5.75e-04 | 8455.51 ms | -100.0% bf16 MFU | 62011 tok/s +step 3150/19560 | loss 3.724446 (+0.52z)| norm 0.2896 (+0.24z)| lr 5.75e-04 | 8455.13 ms | -100.0% bf16 MFU | 62010 tok/s +step 3151/19560 | loss 3.667791 (-1.15z)| norm 0.2909 (+0.29z)| lr 5.75e-04 | 8452.85 ms | -100.0% bf16 MFU | 62011 tok/s +step 3152/19560 | loss 3.621307 (-2.45z)| norm 0.2867 (+0.12z)| lr 5.75e-04 | 8454.70 ms | -100.0% bf16 MFU | 62011 tok/s +step 3153/19560 | loss 3.605209 (-2.80z)| norm 0.2971 (+0.53z)| lr 5.75e-04 | 8453.07 ms | -100.0% bf16 MFU | 62012 tok/s +step 3154/19560 | loss 3.692037 (-0.36z)| norm 0.3586 (+2.91z)| lr 5.75e-04 | 8451.47 ms | -100.0% bf16 MFU | 62013 tok/s +step 3155/19560 | loss 3.691394 (-0.38z)| norm 0.3578 (+2.77z)| lr 5.75e-04 | 8441.58 ms | -100.0% bf16 MFU | 62018 tok/s +step 3156/19560 | loss 3.695679 (-0.25z)| norm 0.3050 (+0.73z)| lr 5.75e-04 | 8438.63 ms | -100.0% bf16 MFU | 62023 tok/s +step 3157/19560 | loss 3.740957 (+1.11z)| norm 0.3063 (+0.77z)| lr 5.75e-04 | 8449.19 ms | -100.0% bf16 MFU | 62025 tok/s +step 3158/19560 | loss 3.701195 (-0.07z)| norm 0.3027 (+0.62z)| lr 5.75e-04 | 8463.60 ms | -100.0% bf16 MFU | 62021 tok/s +step 3159/19560 | loss 3.744685 (+1.22z)| norm 0.2896 (+0.09z)| lr 5.75e-04 | 8461.83 ms | -100.0% bf16 MFU | 62018 tok/s +step 3160/19560 | loss 3.690725 (-0.39z)| norm 0.2647 (-0.91z)| lr 5.75e-04 | 8462.64 ms | -100.0% bf16 MFU | 62015 tok/s +step 3161/19560 | loss 3.652516 (-1.52z)| norm 0.2732 (-0.57z)| lr 5.75e-04 | 8455.40 ms | -100.0% bf16 MFU | 62014 tok/s +step 3162/19560 | loss 3.684071 (-0.58z)| norm 0.2782 (-0.37z)| lr 5.75e-04 | 8456.65 ms | -100.0% bf16 MFU | 62013 tok/s +step 3163/19560 | loss 3.734243 (+0.94z)| norm 0.2796 (-0.31z)| lr 5.75e-04 | 8460.89 ms | -100.0% bf16 MFU | 62011 tok/s +step 3164/19560 | loss 3.678158 (-0.76z)| norm 0.3252 (+1.50z)| lr 5.75e-04 | 8458.95 ms | -100.0% bf16 MFU | 62009 tok/s +step 3165/19560 | loss 3.698786 (-0.13z)| norm 0.3201 (+1.34z)| lr 5.75e-04 | 8456.25 ms | -100.0% bf16 MFU | 62009 tok/s +step 3166/19560 | loss 3.735775 (+0.98z)| norm 0.2769 (-0.40z)| lr 5.75e-04 | 8463.94 ms | -100.0% bf16 MFU | 62006 tok/s +step 3167/19560 | loss 3.685966 (-0.51z)| norm 0.2741 (-0.51z)| lr 5.75e-04 | 8456.95 ms | -100.0% bf16 MFU | 62005 tok/s +step 3168/19560 | loss 3.728425 (+0.77z)| norm 0.2781 (-0.34z)| lr 5.75e-04 | 8458.48 ms | -100.0% bf16 MFU | 62004 tok/s +step 3169/19560 | loss 3.644713 (-1.72z)| norm 0.2499 (-1.48z)| lr 5.75e-04 | 8458.69 ms | -100.0% bf16 MFU | 62003 tok/s +step 3170/19560 | loss 3.634863 (-1.97z)| norm 0.3090 (+0.96z)| lr 5.75e-04 | 8455.95 ms | -100.0% bf16 MFU | 62003 tok/s +step 3171/19560 | loss 3.621581 (-2.29z)| norm 0.2830 (-0.11z)| lr 5.75e-04 | 8459.27 ms | -100.0% bf16 MFU | 62002 tok/s +step 3172/19560 | loss 3.644896 (-1.59z)| norm 0.2809 (-0.20z)| lr 5.75e-04 | 8459.39 ms | -100.0% bf16 MFU | 62000 tok/s +step 3173/19560 | loss 3.673748 (-0.77z)| norm 0.2847 (-0.04z)| lr 5.75e-04 | 8456.90 ms | -100.0% bf16 MFU | 62000 tok/s +step 3174/19560 | loss 3.729078 (+0.79z)| norm 0.2902 (+0.18z)| lr 5.75e-04 | 8458.24 ms | -100.0% bf16 MFU | 61999 tok/s +step 3175/19560 | loss 3.710837 (+0.28z)| norm 0.2853 (-0.03z)| lr 5.75e-04 | 8461.84 ms | -100.0% bf16 MFU | 61997 tok/s +step 3176/19560 | loss 3.642693 (-1.63z)| norm 0.3004 (+0.62z)| lr 5.75e-04 | 8457.93 ms | -100.0% bf16 MFU | 61997 tok/s +step 3177/19560 | loss 3.652488 (-1.33z)| norm 0.2743 (-0.47z)| lr 5.75e-04 | 8456.66 ms | -100.0% bf16 MFU | 61997 tok/s +step 3178/19560 | loss 3.719045 (+0.54z)| norm 0.2791 (-0.25z)| lr 5.75e-04 | 8453.03 ms | -100.0% bf16 MFU | 61998 tok/s +step 3179/19560 | loss 3.647537 (-1.49z)| norm 0.2687 (-0.69z)| lr 5.75e-04 | 8456.52 ms | -100.0% bf16 MFU | 61998 tok/s +step 3180/19560 | loss 3.641114 (-1.64z)| norm 0.2736 (-0.47z)| lr 5.75e-04 | 8459.87 ms | -100.0% bf16 MFU | 61997 tok/s +step 3181/19560 | loss 3.642087 (-1.59z)| norm 0.2706 (-0.59z)| lr 5.75e-04 | 8462.27 ms | -100.0% bf16 MFU | 61995 tok/s +step 3182/19560 | loss 3.767863 (+1.90z)| norm 0.2559 (-1.22z)| lr 5.75e-04 | 8464.86 ms | -100.0% bf16 MFU | 61992 tok/s +step 3183/19560 | loss 3.722409 (+0.65z)| norm 0.2870 (+0.12z)| lr 5.75e-04 | 8465.57 ms | -100.0% bf16 MFU | 61989 tok/s +step 3184/19560 | loss 3.669512 (-0.82z)| norm 0.2923 (+0.34z)| lr 5.75e-04 | 8464.33 ms | -100.0% bf16 MFU | 61987 tok/s +step 3185/19560 | loss 3.743968 (+1.27z)| norm 0.2698 (-0.65z)| lr 5.75e-04 | 8457.69 ms | -100.0% bf16 MFU | 61987 tok/s +step 3186/19560 | loss 3.696803 (-0.05z)| norm 0.2821 (-0.11z)| lr 5.75e-04 | 8464.01 ms | -100.0% bf16 MFU | 61985 tok/s +step 3187/19560 | loss 3.654681 (-1.21z)| norm 0.2868 (+0.09z)| lr 5.75e-04 | 8461.87 ms | -100.0% bf16 MFU | 61983 tok/s +step 3188/19560 | loss 3.787869 (+2.45z)| norm 0.3354 (+2.17z)| lr 5.75e-04 | 8461.98 ms | -100.0% bf16 MFU | 61982 tok/s +step 3189/19560 | loss 3.718443 (+0.54z)| norm 0.2917 (+0.28z)| lr 5.75e-04 | 8465.45 ms | -100.0% bf16 MFU | 61980 tok/s +step 3190/19560 | loss 3.675343 (-0.64z)| norm 0.2892 (+0.17z)| lr 5.75e-04 | 8467.38 ms | -100.0% bf16 MFU | 61977 tok/s +step 3191/19560 | loss 3.636542 (-1.67z)| norm 0.3060 (+0.89z)| lr 5.75e-04 | 8464.30 ms | -100.0% bf16 MFU | 61975 tok/s +step 3192/19560 | loss 3.622252 (-2.01z)| norm 0.3237 (+1.64z)| lr 5.75e-04 | 8468.06 ms | -100.0% bf16 MFU | 61972 tok/s +step 3193/19560 | loss 3.653720 (-1.16z)| norm 0.2725 (-0.56z)| lr 5.75e-04 | 8460.53 ms | -100.0% bf16 MFU | 61972 tok/s +step 3194/19560 | loss 3.657785 (-1.04z)| norm 0.2925 (+0.30z)| lr 5.75e-04 | 8460.62 ms | -100.0% bf16 MFU | 61971 tok/s +step 3195/19560 | loss 3.640162 (-1.50z)| norm 0.3157 (+1.28z)| lr 5.74e-04 | 8462.79 ms | -100.0% bf16 MFU | 61970 tok/s +step 3196/19560 | loss 3.696021 (+0.01z)| norm 0.2886 (+0.13z)| lr 5.74e-04 | 8460.86 ms | -100.0% bf16 MFU | 61970 tok/s +step 3197/19560 | loss 3.699148 (+0.11z)| norm 0.2659 (-0.83z)| lr 5.74e-04 | 8456.73 ms | -100.0% bf16 MFU | 61972 tok/s +step 3198/19560 | loss 3.686733 (-0.22z)| norm 0.2923 (+0.30z)| lr 5.74e-04 | 8460.66 ms | -100.0% bf16 MFU | 61971 tok/s +step 3199/19560 | loss 3.672197 (-0.62z)| norm 0.2790 (-0.27z)| lr 5.74e-04 | 8456.26 ms | -100.0% bf16 MFU | 61973 tok/s +step 3200/19560 | loss 3.756343 (+1.72z)| norm 0.2667 (-0.79z)| lr 5.74e-04 | 8461.97 ms | -100.0% bf16 MFU | 61972 tok/s +step 3201/19560 | loss 3.640847 (-1.47z)| norm 0.2631 (-0.93z)| lr 5.74e-04 | 8461.09 ms | -100.0% bf16 MFU | 61972 tok/s +step 3202/19560 | loss 3.721537 (+0.76z)| norm 0.2671 (-0.75z)| lr 5.74e-04 | 8459.50 ms | -100.0% bf16 MFU | 61972 tok/s +step 3203/19560 | loss 3.673560 (-0.56z)| norm 0.2856 (+0.04z)| lr 5.74e-04 | 8461.37 ms | -100.0% bf16 MFU | 61971 tok/s +step 3204/19560 | loss 3.629921 (-1.74z)| norm 0.2980 (+0.57z)| lr 5.74e-04 | 8455.92 ms | -100.0% bf16 MFU | 61973 tok/s +step 3205/19560 | loss 3.615077 (-2.10z)| norm 0.2815 (-0.13z)| lr 5.74e-04 | 8461.60 ms | -100.0% bf16 MFU | 61972 tok/s +step 3206/19560 | loss 3.631333 (-1.63z)| norm 0.2806 (-0.16z)| lr 5.74e-04 | 8461.65 ms | -100.0% bf16 MFU | 61972 tok/s +step 3207/19560 | loss 3.632331 (-1.57z)| norm 0.2450 (-1.65z)| lr 5.74e-04 | 8463.15 ms | -100.0% bf16 MFU | 61971 tok/s +step 3208/19560 | loss 3.686661 (-0.12z)| norm 0.2594 (-1.03z)| lr 5.74e-04 | 8460.90 ms | -100.0% bf16 MFU | 61970 tok/s +step 3209/19560 | loss 3.636448 (-1.43z)| norm 0.2757 (-0.34z)| lr 5.74e-04 | 8456.59 ms | -100.0% bf16 MFU | 61972 tok/s +step 3210/19560 | loss 3.642829 (-1.26z)| norm 0.2560 (-1.15z)| lr 5.74e-04 | 8457.97 ms | -100.0% bf16 MFU | 61973 tok/s +step 3211/19560 | loss 3.683346 (-0.15z)| norm 0.2519 (-1.30z)| lr 5.74e-04 | 8454.68 ms | -100.0% bf16 MFU | 61975 tok/s +step 3212/19560 | loss 3.715970 (+0.75z)| norm 0.2645 (-0.77z)| lr 5.74e-04 | 8460.07 ms | -100.0% bf16 MFU | 61974 tok/s +step 3213/19560 | loss 3.622206 (-1.79z)| norm 0.2621 (-0.86z)| lr 5.74e-04 | 8456.94 ms | -100.0% bf16 MFU | 61975 tok/s +step 3214/19560 | loss 3.675532 (-0.34z)| norm 0.2894 (+0.31z)| lr 5.74e-04 | 8457.37 ms | -100.0% bf16 MFU | 61976 tok/s +step 3215/19560 | loss 3.604630 (-2.21z)| norm 0.3348 (+2.21z)| lr 5.74e-04 | 8461.09 ms | -100.0% bf16 MFU | 61976 tok/s +step 3216/19560 | loss 3.670388 (-0.44z)| norm 0.3199 (+1.56z)| lr 5.74e-04 | 8452.36 ms | -100.0% bf16 MFU | 61978 tok/s +step 3217/19560 | loss 3.667075 (-0.52z)| norm 0.2770 (-0.23z)| lr 5.74e-04 | 8458.85 ms | -100.0% bf16 MFU | 61978 tok/s +step 3218/19560 | loss 3.665206 (-0.56z)| norm 0.3035 (+0.86z)| lr 5.74e-04 | 8459.47 ms | -100.0% bf16 MFU | 61978 tok/s +step 3219/19560 | loss 3.622155 (-1.70z)| norm 0.2982 (+0.63z)| lr 5.74e-04 | 8448.75 ms | -100.0% bf16 MFU | 61982 tok/s +step 3220/19560 | loss 3.671247 (-0.38z)| norm 0.2970 (+0.57z)| lr 5.74e-04 | 8455.22 ms | -100.0% bf16 MFU | 61983 tok/s +step 3221/19560 | loss 3.680583 (-0.13z)| norm 0.2977 (+0.59z)| lr 5.74e-04 | 8463.47 ms | -100.0% bf16 MFU | 61982 tok/s +step 3222/19560 | loss 3.684967 (-0.01z)| norm 0.2834 (-0.02z)| lr 5.74e-04 | 8459.27 ms | -100.0% bf16 MFU | 61981 tok/s +step 3223/19560 | loss 3.685162 (-0.01z)| norm 0.2751 (-0.39z)| lr 5.74e-04 | 8452.96 ms | -100.0% bf16 MFU | 61984 tok/s +step 3224/19560 | loss 3.693089 (+0.20z)| norm 0.2559 (-1.21z)| lr 5.74e-04 | 8453.48 ms | -100.0% bf16 MFU | 61985 tok/s +step 3225/19560 | loss 3.688797 (+0.10z)| norm 0.2537 (-1.31z)| lr 5.74e-04 | 8459.90 ms | -100.0% bf16 MFU | 61985 tok/s +step 3226/19560 | loss 3.723211 (+1.02z)| norm 0.3181 (+1.43z)| lr 5.74e-04 | 8452.70 ms | -100.0% bf16 MFU | 61987 tok/s +step 3227/19560 | loss 3.690115 (+0.13z)| norm 0.3252 (+1.70z)| lr 5.74e-04 | 8455.68 ms | -100.0% bf16 MFU | 61988 tok/s +step 3228/19560 | loss 3.683486 (-0.04z)| norm 0.3108 (+1.08z)| lr 5.74e-04 | 8458.27 ms | -100.0% bf16 MFU | 61988 tok/s +step 3229/19560 | loss 3.664876 (-0.55z)| norm 0.2682 (-0.77z)| lr 5.74e-04 | 8454.28 ms | -100.0% bf16 MFU | 61989 tok/s +step 3230/19560 | loss 3.732082 (+1.29z)| norm 0.2762 (-0.42z)| lr 5.74e-04 | 8453.78 ms | -100.0% bf16 MFU | 61990 tok/s +step 3231/19560 | loss 3.660761 (-0.66z)| norm 0.2987 (+0.57z)| lr 5.74e-04 | 8461.07 ms | -100.0% bf16 MFU | 61989 tok/s +step 3232/19560 | loss 3.786915 (+2.69z)| norm 0.3054 (+0.88z)| lr 5.74e-04 | 8453.19 ms | -100.0% bf16 MFU | 61991 tok/s +step 3233/19560 | loss 3.685398 (-0.00z)| norm 0.3184 (+1.43z)| lr 5.74e-04 | 8456.90 ms | -100.0% bf16 MFU | 61991 tok/s +step 3234/19560 | loss 3.692319 (+0.19z)| norm 0.3250 (+1.69z)| lr 5.74e-04 | 8459.01 ms | -100.0% bf16 MFU | 61990 tok/s +step 3235/19560 | loss 3.650269 (-0.92z)| norm 0.2957 (+0.40z)| lr 5.74e-04 | 8451.40 ms | -100.0% bf16 MFU | 61993 tok/s +step 3236/19560 | loss 3.685543 (+0.02z)| norm 0.2638 (-0.99z)| lr 5.74e-04 | 8455.34 ms | -100.0% bf16 MFU | 61993 tok/s +step 3237/19560 | loss 3.650803 (-0.90z)| norm 0.2565 (-1.30z)| lr 5.74e-04 | 8455.79 ms | -100.0% bf16 MFU | 61994 tok/s +step 3238/19560 | loss 3.636157 (-1.26z)| norm 0.2534 (-1.41z)| lr 5.74e-04 | 8456.40 ms | -100.0% bf16 MFU | 61994 tok/s +step 3239/19560 | loss 3.684869 (+0.03z)| norm 0.2487 (-1.58z)| lr 5.74e-04 | 8456.68 ms | -100.0% bf16 MFU | 61994 tok/s +step 3240/19560 | loss 3.675884 (-0.21z)| norm 0.2601 (-1.08z)| lr 5.74e-04 | 8459.67 ms | -100.0% bf16 MFU | 61993 tok/s +step 3241/19560 | loss 3.666766 (-0.44z)| norm 0.2608 (-1.03z)| lr 5.74e-04 | 8457.10 ms | -100.0% bf16 MFU | 61993 tok/s +step 3242/19560 | loss 3.693047 (+0.25z)| norm 0.2530 (-1.34z)| lr 5.74e-04 | 8454.65 ms | -100.0% bf16 MFU | 61994 tok/s +step 3243/19560 | loss 3.631912 (-1.36z)| norm 0.2765 (-0.35z)| lr 5.74e-04 | 8456.71 ms | -100.0% bf16 MFU | 61994 tok/s +step 3244/19560 | loss 3.659830 (-0.60z)| norm 0.2738 (-0.47z)| lr 5.73e-04 | 8453.66 ms | -100.0% bf16 MFU | 61996 tok/s +step 3245/19560 | loss 3.705235 (+0.63z)| norm 0.3047 (+0.83z)| lr 5.73e-04 | 8457.17 ms | -100.0% bf16 MFU | 61996 tok/s +step 3246/19560 | loss 3.706762 (+0.67z)| norm 0.2917 (+0.28z)| lr 5.73e-04 | 8453.87 ms | -100.0% bf16 MFU | 61997 tok/s +step 3247/19560 | loss 3.686741 (+0.13z)| norm 0.2875 (+0.09z)| lr 5.73e-04 | 8460.64 ms | -100.0% bf16 MFU | 61995 tok/s +step 3248/19560 | loss 3.678475 (-0.10z)| norm 0.2993 (+0.58z)| lr 5.73e-04 | 8451.37 ms | -100.0% bf16 MFU | 61997 tok/s +step 3249/19560 | loss 3.631909 (-1.33z)| norm 0.3289 (+1.81z)| lr 5.73e-04 | 8451.18 ms | -100.0% bf16 MFU | 61999 tok/s +step 3250/19560 | loss 3.702497 (+0.56z)| norm 0.3262 (+1.66z)| lr 5.73e-04 | 8451.91 ms | -100.0% bf16 MFU | 62001 tok/s +val loss 3.666925 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2694/10042 = 0.268273 +step 3251/19560 | loss 3.711230 (+0.83z)| norm 0.3048 (+0.75z)| lr 5.73e-04 | 8456.56 ms | -100.0% bf16 MFU | 62001 tok/s +step 3252/19560 | loss 3.715133 (+0.96z)| norm 0.2893 (+0.09z)| lr 5.73e-04 | 8453.78 ms | -100.0% bf16 MFU | 62002 tok/s +step 3253/19560 | loss 3.690766 (+0.28z)| norm 0.2638 (-0.98z)| lr 5.73e-04 | 8451.94 ms | -100.0% bf16 MFU | 62003 tok/s +step 3254/19560 | loss 3.648199 (-0.89z)| norm 0.2638 (-0.97z)| lr 5.73e-04 | 8455.18 ms | -100.0% bf16 MFU | 62003 tok/s +step 3255/19560 | loss 3.639208 (-1.13z)| norm 0.2508 (-1.49z)| lr 5.73e-04 | 8455.58 ms | -100.0% bf16 MFU | 62003 tok/s +step 3256/19560 | loss 3.645167 (-0.95z)| norm 0.2446 (-1.74z)| lr 5.73e-04 | 8455.46 ms | -100.0% bf16 MFU | 62004 tok/s +step 3257/19560 | loss 3.705666 (+0.72z)| norm 0.2942 (+0.39z)| lr 5.73e-04 | 8451.23 ms | -100.0% bf16 MFU | 62005 tok/s +step 3258/19560 | loss 3.744367 (+1.75z)| norm 0.3406 (+2.33z)| lr 5.73e-04 | 8451.09 ms | -100.0% bf16 MFU | 62007 tok/s +step 3259/19560 | loss 3.626666 (-1.43z)| norm 0.3497 (+2.63z)| lr 5.73e-04 | 8448.96 ms | -100.0% bf16 MFU | 62009 tok/s +step 3260/19560 | loss 3.629191 (-1.34z)| norm 0.3305 (+1.79z)| lr 5.73e-04 | 8454.86 ms | -100.0% bf16 MFU | 62009 tok/s +step 3261/19560 | loss 3.716657 (+1.00z)| norm 0.2856 (-0.04z)| lr 5.73e-04 | 8454.73 ms | -100.0% bf16 MFU | 62009 tok/s +step 3262/19560 | loss 3.649859 (-0.79z)| norm 0.2657 (-0.86z)| lr 5.73e-04 | 8453.79 ms | -100.0% bf16 MFU | 62010 tok/s +step 3263/19560 | loss 3.655710 (-0.63z)| norm 0.2656 (-0.86z)| lr 5.73e-04 | 8456.70 ms | -100.0% bf16 MFU | 62009 tok/s +step 3264/19560 | loss 3.690240 (+0.30z)| norm 0.2506 (-1.48z)| lr 5.73e-04 | 8448.29 ms | -100.0% bf16 MFU | 62012 tok/s +step 3265/19560 | loss 3.612693 (-1.76z)| norm 0.2670 (-0.80z)| lr 5.73e-04 | 8451.16 ms | -100.0% bf16 MFU | 62013 tok/s +step 3266/19560 | loss 3.718177 (+1.03z)| norm 0.2677 (-0.77z)| lr 5.73e-04 | 8454.67 ms | -100.0% bf16 MFU | 62013 tok/s +step 3267/19560 | loss 3.671665 (-0.19z)| norm 0.2942 (+0.31z)| lr 5.73e-04 | 8449.09 ms | -100.0% bf16 MFU | 62015 tok/s +step 3268/19560 | loss 3.677330 (-0.04z)| norm 0.3026 (+0.65z)| lr 5.73e-04 | 8455.39 ms | -100.0% bf16 MFU | 62014 tok/s +step 3269/19560 | loss 3.664576 (-0.37z)| norm 0.2460 (-1.64z)| lr 5.73e-04 | 8448.49 ms | -100.0% bf16 MFU | 62017 tok/s +step 3270/19560 | loss 3.675033 (-0.10z)| norm 0.3172 (+1.26z)| lr 5.73e-04 | 8453.18 ms | -100.0% bf16 MFU | 62017 tok/s +step 3271/19560 | loss 3.709932 (+0.82z)| norm 0.2970 (+0.43z)| lr 5.73e-04 | 8450.81 ms | -100.0% bf16 MFU | 62018 tok/s +step 3272/19560 | loss 3.693516 (+0.40z)| norm 0.2901 (+0.15z)| lr 5.73e-04 | 8452.73 ms | -100.0% bf16 MFU | 62018 tok/s +step 3273/19560 | loss 3.665188 (-0.35z)| norm 0.3092 (+0.91z)| lr 5.73e-04 | 8449.50 ms | -100.0% bf16 MFU | 62020 tok/s +step 3274/19560 | loss 3.704310 (+0.69z)| norm 0.3030 (+0.65z)| lr 5.73e-04 | 8452.02 ms | -100.0% bf16 MFU | 62020 tok/s +step 3275/19560 | loss 3.784512 (+2.71z)| norm 0.2831 (-0.17z)| lr 5.73e-04 | 8452.25 ms | -100.0% bf16 MFU | 62021 tok/s +step 3276/19560 | loss 3.669545 (-0.25z)| norm 0.2676 (-0.80z)| lr 5.73e-04 | 8449.49 ms | -100.0% bf16 MFU | 62022 tok/s +step 3277/19560 | loss 3.769666 (+2.28z)| norm 0.2921 (+0.21z)| lr 5.73e-04 | 8452.23 ms | -100.0% bf16 MFU | 62023 tok/s +step 3278/19560 | loss 3.744435 (+1.63z)| norm 0.3138 (+1.08z)| lr 5.73e-04 | 8453.81 ms | -100.0% bf16 MFU | 62022 tok/s +step 3279/19560 | loss 3.697206 (+0.43z)| norm 0.3200 (+1.32z)| lr 5.73e-04 | 8452.25 ms | -100.0% bf16 MFU | 62023 tok/s +step 3280/19560 | loss 3.710733 (+0.76z)| norm 0.3263 (+1.55z)| lr 5.73e-04 | 8454.63 ms | -100.0% bf16 MFU | 62022 tok/s +step 3281/19560 | loss 3.660254 (-0.55z)| norm 0.2909 (+0.13z)| lr 5.73e-04 | 8451.62 ms | -100.0% bf16 MFU | 62023 tok/s +step 3282/19560 | loss 3.688135 (+0.17z)| norm 0.2835 (-0.15z)| lr 5.73e-04 | 8450.18 ms | -100.0% bf16 MFU | 62024 tok/s +step 3283/19560 | loss 3.646713 (-0.88z)| norm 0.2741 (-0.53z)| lr 5.73e-04 | 8452.25 ms | -100.0% bf16 MFU | 62024 tok/s +step 3284/19560 | loss 3.662635 (-0.47z)| norm 0.2963 (+0.42z)| lr 5.73e-04 | 8454.17 ms | -100.0% bf16 MFU | 62024 tok/s +step 3285/19560 | loss 3.696062 (+0.41z)| norm 0.2822 (-0.17z)| lr 5.73e-04 | 8452.56 ms | -100.0% bf16 MFU | 62024 tok/s +step 3286/19560 | loss 3.760771 (+2.04z)| norm 0.2977 (+0.50z)| lr 5.73e-04 | 8450.34 ms | -100.0% bf16 MFU | 62025 tok/s +step 3287/19560 | loss 3.695328 (+0.38z)| norm 0.2727 (-0.57z)| lr 5.73e-04 | 8450.12 ms | -100.0% bf16 MFU | 62026 tok/s +step 3288/19560 | loss 3.660112 (-0.52z)| norm 0.2886 (+0.10z)| lr 5.73e-04 | 8452.33 ms | -100.0% bf16 MFU | 62026 tok/s +step 3289/19560 | loss 3.758626 (+1.97z)| norm 0.2962 (+0.42z)| lr 5.73e-04 | 8450.65 ms | -100.0% bf16 MFU | 62027 tok/s +step 3290/19560 | loss 3.799401 (+2.89z)| norm 0.2773 (-0.39z)| lr 5.73e-04 | 8451.32 ms | -100.0% bf16 MFU | 62027 tok/s +step 3291/19560 | loss 3.640076 (-1.01z)| norm 0.2917 (+0.22z)| lr 5.73e-04 | 8449.52 ms | -100.0% bf16 MFU | 62028 tok/s +step 3292/19560 | loss 3.706321 (+0.61z)| norm 0.2825 (-0.16z)| lr 5.72e-04 | 8451.10 ms | -100.0% bf16 MFU | 62029 tok/s +step 3293/19560 | loss 3.675996 (-0.13z)| norm 0.2956 (+0.42z)| lr 5.72e-04 | 8449.41 ms | -100.0% bf16 MFU | 62030 tok/s +step 3294/19560 | loss 3.622014 (-1.44z)| norm 0.3232 (+1.61z)| lr 5.72e-04 | 8451.25 ms | -100.0% bf16 MFU | 62030 tok/s +step 3295/19560 | loss 3.662843 (-0.43z)| norm 0.3100 (+1.02z)| lr 5.72e-04 | 8448.29 ms | -100.0% bf16 MFU | 62032 tok/s +step 3296/19560 | loss 3.650248 (-0.72z)| norm 0.2899 (+0.14z)| lr 5.72e-04 | 8451.99 ms | -100.0% bf16 MFU | 62032 tok/s +step 3297/19560 | loss 3.641762 (-0.93z)| norm 0.2647 (-0.97z)| lr 5.72e-04 | 8449.48 ms | -100.0% bf16 MFU | 62033 tok/s +step 3298/19560 | loss 3.658748 (-0.52z)| norm 0.2676 (-0.83z)| lr 5.72e-04 | 8450.70 ms | -100.0% bf16 MFU | 62033 tok/s +step 3299/19560 | loss 3.666028 (-0.35z)| norm 0.3026 (+0.70z)| lr 5.72e-04 | 8450.80 ms | -100.0% bf16 MFU | 62033 tok/s +step 3300/19560 | loss 3.681345 (+0.02z)| norm 0.2779 (-0.38z)| lr 5.72e-04 | 8446.44 ms | -100.0% bf16 MFU | 62035 tok/s +step 3301/19560 | loss 3.668972 (-0.29z)| norm 0.2737 (-0.56z)| lr 5.72e-04 | 8448.19 ms | -100.0% bf16 MFU | 62036 tok/s +step 3302/19560 | loss 3.693778 (+0.34z)| norm 0.2861 (-0.01z)| lr 5.72e-04 | 8452.40 ms | -100.0% bf16 MFU | 62036 tok/s +step 3303/19560 | loss 3.696856 (+0.42z)| norm 0.2801 (-0.28z)| lr 5.72e-04 | 8448.53 ms | -100.0% bf16 MFU | 62037 tok/s +step 3304/19560 | loss 3.661480 (-0.47z)| norm 0.2745 (-0.51z)| lr 5.72e-04 | 8448.75 ms | -100.0% bf16 MFU | 62038 tok/s +step 3305/19560 | loss 3.671165 (-0.23z)| norm 0.2869 (+0.03z)| lr 5.72e-04 | 8451.96 ms | -100.0% bf16 MFU | 62038 tok/s +step 3306/19560 | loss 3.628223 (-1.30z)| norm 0.2831 (-0.14z)| lr 5.72e-04 | 8449.52 ms | -100.0% bf16 MFU | 62038 tok/s +step 3307/19560 | loss 3.682323 (+0.06z)| norm 0.3069 (+0.89z)| lr 5.72e-04 | 8451.00 ms | -100.0% bf16 MFU | 62038 tok/s +step 3308/19560 | loss 3.641512 (-0.97z)| norm 0.2781 (-0.38z)| lr 5.72e-04 | 8447.56 ms | -100.0% bf16 MFU | 62040 tok/s +step 3309/19560 | loss 3.636269 (-1.10z)| norm 0.2535 (-1.44z)| lr 5.72e-04 | 8448.32 ms | -100.0% bf16 MFU | 62040 tok/s +step 3310/19560 | loss 3.695945 (+0.43z)| norm 0.2577 (-1.26z)| lr 5.72e-04 | 8448.39 ms | -100.0% bf16 MFU | 62041 tok/s +step 3311/19560 | loss 3.741741 (+1.60z)| norm 0.2614 (-1.08z)| lr 5.72e-04 | 8451.79 ms | -100.0% bf16 MFU | 62041 tok/s +step 3312/19560 | loss 3.641954 (-0.95z)| norm 0.2327 (-2.26z)| lr 5.72e-04 | 8448.02 ms | -100.0% bf16 MFU | 62042 tok/s +step 3313/19560 | loss 3.674348 (-0.11z)| norm 0.2766 (-0.40z)| lr 5.72e-04 | 8449.32 ms | -100.0% bf16 MFU | 62042 tok/s +step 3314/19560 | loss 3.705776 (+0.70z)| norm 0.2685 (-0.73z)| lr 5.72e-04 | 8450.14 ms | -100.0% bf16 MFU | 62042 tok/s +step 3315/19560 | loss 3.683424 (+0.12z)| norm 0.2684 (-0.73z)| lr 5.72e-04 | 8450.34 ms | -100.0% bf16 MFU | 62043 tok/s +step 3316/19560 | loss 3.684347 (+0.17z)| norm 0.2570 (-1.21z)| lr 5.72e-04 | 8450.94 ms | -100.0% bf16 MFU | 62042 tok/s +step 3317/19560 | loss 3.651311 (-0.70z)| norm 0.2428 (-1.77z)| lr 5.72e-04 | 8446.65 ms | -100.0% bf16 MFU | 62044 tok/s +step 3318/19560 | loss 3.768271 (+2.35z)| norm 0.2669 (-0.75z)| lr 5.72e-04 | 8447.95 ms | -100.0% bf16 MFU | 62045 tok/s +step 3319/19560 | loss 3.612362 (-1.71z)| norm 0.2922 (+0.33z)| lr 5.72e-04 | 8449.00 ms | -100.0% bf16 MFU | 62045 tok/s +step 3320/19560 | loss 3.609990 (-1.76z)| norm 0.3050 (+0.88z)| lr 5.72e-04 | 8447.06 ms | -100.0% bf16 MFU | 62046 tok/s +step 3321/19560 | loss 3.605438 (-1.85z)| norm 0.3093 (+1.05z)| lr 5.72e-04 | 8449.23 ms | -100.0% bf16 MFU | 62046 tok/s +step 3322/19560 | loss 3.628997 (-1.23z)| norm 0.2857 (+0.05z)| lr 5.72e-04 | 8447.14 ms | -100.0% bf16 MFU | 62047 tok/s +step 3323/19560 | loss 3.667713 (-0.25z)| norm 0.2874 (+0.13z)| lr 5.72e-04 | 8448.27 ms | -100.0% bf16 MFU | 62048 tok/s +step 3324/19560 | loss 3.669541 (-0.20z)| norm 0.3046 (+0.86z)| lr 5.72e-04 | 8449.85 ms | -100.0% bf16 MFU | 62048 tok/s +step 3325/19560 | loss 3.681297 (+0.10z)| norm 0.2654 (-0.81z)| lr 5.72e-04 | 8450.58 ms | -100.0% bf16 MFU | 62048 tok/s +step 3326/19560 | loss 3.648310 (-0.73z)| norm 0.2989 (+0.61z)| lr 5.72e-04 | 8450.91 ms | -100.0% bf16 MFU | 62047 tok/s +step 3327/19560 | loss 3.713757 (+0.93z)| norm 0.2716 (-0.55z)| lr 5.72e-04 | 8445.52 ms | -100.0% bf16 MFU | 62049 tok/s +step 3328/19560 | loss 3.649755 (-0.69z)| norm 0.2565 (-1.18z)| lr 5.72e-04 | 8438.85 ms | -100.0% bf16 MFU | 62053 tok/s +step 3329/19560 | loss 3.684753 (+0.20z)| norm 0.2855 (+0.04z)| lr 5.72e-04 | 8438.06 ms | -100.0% bf16 MFU | 62057 tok/s +step 3330/19560 | loss 3.677678 (+0.03z)| norm 0.2860 (+0.06z)| lr 5.72e-04 | 8434.47 ms | -100.0% bf16 MFU | 62062 tok/s +step 3331/19560 | loss 3.699233 (+0.59z)| norm 0.2540 (-1.29z)| lr 5.72e-04 | 8433.71 ms | -100.0% bf16 MFU | 62067 tok/s +step 3332/19560 | loss 3.693934 (+0.44z)| norm 0.2509 (-1.40z)| lr 5.72e-04 | 8435.51 ms | -100.0% bf16 MFU | 62071 tok/s +step 3333/19560 | loss 3.673386 (-0.11z)| norm 0.2573 (-1.11z)| lr 5.72e-04 | 8434.94 ms | -100.0% bf16 MFU | 62076 tok/s +step 3334/19560 | loss 3.612709 (-1.70z)| norm 0.2590 (-1.03z)| lr 5.72e-04 | 8435.07 ms | -100.0% bf16 MFU | 62080 tok/s +step 3335/19560 | loss 3.682559 (+0.12z)| norm 0.2726 (-0.48z)| lr 5.72e-04 | 8433.62 ms | -100.0% bf16 MFU | 62084 tok/s +step 3336/19560 | loss 3.684051 (+0.16z)| norm 0.2919 (+0.33z)| lr 5.72e-04 | 8435.65 ms | -100.0% bf16 MFU | 62087 tok/s +step 3337/19560 | loss 3.713188 (+0.92z)| norm 0.2877 (+0.15z)| lr 5.72e-04 | 8435.79 ms | -100.0% bf16 MFU | 62091 tok/s +step 3338/19560 | loss 3.720595 (+1.09z)| norm 0.2842 (-0.01z)| lr 5.72e-04 | 8434.51 ms | -100.0% bf16 MFU | 62094 tok/s +step 3339/19560 | loss 3.651188 (-0.73z)| norm 0.2876 (+0.12z)| lr 5.71e-04 | 8434.18 ms | -100.0% bf16 MFU | 62097 tok/s +step 3340/19560 | loss 3.701281 (+0.60z)| norm 0.2780 (-0.29z)| lr 5.71e-04 | 8436.67 ms | -100.0% bf16 MFU | 62100 tok/s +step 3341/19560 | loss 3.693808 (+0.39z)| norm 0.2506 (-1.47z)| lr 5.71e-04 | 8437.76 ms | -100.0% bf16 MFU | 62102 tok/s +step 3342/19560 | loss 3.635485 (-1.15z)| norm 0.2702 (-0.62z)| lr 5.71e-04 | 8434.84 ms | -100.0% bf16 MFU | 62104 tok/s +step 3343/19560 | loss 3.671464 (-0.22z)| norm 0.2798 (-0.19z)| lr 5.71e-04 | 8436.69 ms | -100.0% bf16 MFU | 62106 tok/s +step 3344/19560 | loss 3.665618 (-0.37z)| norm 0.2713 (-0.55z)| lr 5.71e-04 | 8437.99 ms | -100.0% bf16 MFU | 62108 tok/s +step 3345/19560 | loss 3.667553 (-0.32z)| norm 0.2656 (-0.79z)| lr 5.71e-04 | 8437.70 ms | -100.0% bf16 MFU | 62109 tok/s +step 3346/19560 | loss 3.702559 (+0.61z)| norm 0.2534 (-1.31z)| lr 5.71e-04 | 8437.52 ms | -100.0% bf16 MFU | 62111 tok/s +step 3347/19560 | loss 3.617661 (-1.66z)| norm 0.2656 (-0.76z)| lr 5.71e-04 | 8437.04 ms | -100.0% bf16 MFU | 62112 tok/s +step 3348/19560 | loss 3.770195 (+2.35z)| norm 0.2915 (+0.37z)| lr 5.71e-04 | 8465.62 ms | -100.0% bf16 MFU | 62103 tok/s +step 3349/19560 | loss 3.657622 (-0.59z)| norm 0.3214 (+1.65z)| lr 5.71e-04 | 8461.52 ms | -100.0% bf16 MFU | 62096 tok/s +step 3350/19560 | loss 3.676263 (-0.10z)| norm 0.3079 (+1.06z)| lr 5.71e-04 | 8462.92 ms | -100.0% bf16 MFU | 62089 tok/s +step 3351/19560 | loss 3.729439 (+1.27z)| norm 0.3025 (+0.81z)| lr 5.71e-04 | 8466.73 ms | -100.0% bf16 MFU | 62080 tok/s +step 3352/19560 | loss 3.663140 (-0.45z)| norm 0.2979 (+0.60z)| lr 5.71e-04 | 8464.64 ms | -100.0% bf16 MFU | 62073 tok/s +step 3353/19560 | loss 3.650851 (-0.76z)| norm 0.2856 (+0.06z)| lr 5.71e-04 | 8464.80 ms | -100.0% bf16 MFU | 62067 tok/s +step 3354/19560 | loss 3.660019 (-0.51z)| norm 0.2982 (+0.62z)| lr 5.71e-04 | 8464.10 ms | -100.0% bf16 MFU | 62060 tok/s +step 3355/19560 | loss 3.667743 (-0.30z)| norm 0.3461 (+2.67z)| lr 5.71e-04 | 8469.81 ms | -100.0% bf16 MFU | 62052 tok/s +step 3356/19560 | loss 3.738440 (+1.51z)| norm 0.3919 (+4.29z)| lr 5.71e-04 | 8468.64 ms | -100.0% bf16 MFU | 62045 tok/s +step 3357/19560 | loss 3.655301 (-0.63z)| norm 0.3704 (+3.26z)| lr 5.71e-04 | 8465.53 ms | -100.0% bf16 MFU | 62040 tok/s +step 3358/19560 | loss 3.687532 (+0.21z)| norm 0.3298 (+1.66z)| lr 5.71e-04 | 8467.96 ms | -100.0% bf16 MFU | 62033 tok/s +step 3359/19560 | loss 3.659388 (-0.52z)| norm 0.2818 (-0.16z)| lr 5.71e-04 | 8473.46 ms | -100.0% bf16 MFU | 62025 tok/s +step 3360/19560 | loss 3.775922 (+2.52z)| norm 0.3160 (+1.14z)| lr 5.71e-04 | 8473.34 ms | -100.0% bf16 MFU | 62018 tok/s +step 3361/19560 | loss 3.705575 (+0.68z)| norm 0.2837 (-0.08z)| lr 5.71e-04 | 8468.52 ms | -100.0% bf16 MFU | 62012 tok/s +step 3362/19560 | loss 3.704222 (+0.64z)| norm 0.2938 (+0.32z)| lr 5.71e-04 | 8468.31 ms | -100.0% bf16 MFU | 62007 tok/s +step 3363/19560 | loss 3.692185 (+0.32z)| norm 0.3002 (+0.56z)| lr 5.71e-04 | 8469.48 ms | -100.0% bf16 MFU | 62002 tok/s +step 3364/19560 | loss 3.650746 (-0.75z)| norm 0.2606 (-0.96z)| lr 5.71e-04 | 8467.40 ms | -100.0% bf16 MFU | 61998 tok/s +step 3365/19560 | loss 3.698743 (+0.49z)| norm 0.2832 (-0.10z)| lr 5.71e-04 | 8467.42 ms | -100.0% bf16 MFU | 61994 tok/s +step 3366/19560 | loss 3.677230 (-0.08z)| norm 0.2781 (-0.30z)| lr 5.71e-04 | 8465.83 ms | -100.0% bf16 MFU | 61991 tok/s +step 3367/19560 | loss 3.667617 (-0.33z)| norm 0.2754 (-0.42z)| lr 5.71e-04 | 8463.76 ms | -100.0% bf16 MFU | 61989 tok/s +step 3368/19560 | loss 3.681865 (+0.04z)| norm 0.2747 (-0.45z)| lr 5.71e-04 | 8464.87 ms | -100.0% bf16 MFU | 61986 tok/s +step 3369/19560 | loss 3.706529 (+0.68z)| norm 0.2728 (-0.53z)| lr 5.71e-04 | 8464.59 ms | -100.0% bf16 MFU | 61984 tok/s +step 3370/19560 | loss 3.767877 (+2.23z)| norm 0.2843 (-0.09z)| lr 5.71e-04 | 8466.95 ms | -100.0% bf16 MFU | 61981 tok/s +step 3371/19560 | loss 3.628660 (-1.34z)| norm 0.2696 (-0.67z)| lr 5.71e-04 | 8466.56 ms | -100.0% bf16 MFU | 61978 tok/s +step 3372/19560 | loss 3.699138 (+0.46z)| norm 0.2536 (-1.29z)| lr 5.71e-04 | 8462.01 ms | -100.0% bf16 MFU | 61977 tok/s +step 3373/19560 | loss 3.668550 (-0.32z)| norm 0.2494 (-1.43z)| lr 5.71e-04 | 8461.37 ms | -100.0% bf16 MFU | 61976 tok/s +step 3374/19560 | loss 3.766525 (+2.15z)| norm 0.2763 (-0.37z)| lr 5.71e-04 | 8457.58 ms | -100.0% bf16 MFU | 61977 tok/s +step 3375/19560 | loss 3.692465 (+0.27z)| norm 0.2991 (+0.52z)| lr 5.71e-04 | 8450.42 ms | -100.0% bf16 MFU | 61980 tok/s +step 3376/19560 | loss 3.667036 (-0.37z)| norm 0.2947 (+0.35z)| lr 5.71e-04 | 8454.80 ms | -100.0% bf16 MFU | 61982 tok/s +step 3377/19560 | loss 3.687437 (+0.14z)| norm 0.3188 (+1.30z)| lr 5.71e-04 | 8456.19 ms | -100.0% bf16 MFU | 61983 tok/s +step 3378/19560 | loss 3.639332 (-1.07z)| norm 0.2611 (-0.96z)| lr 5.71e-04 | 8449.30 ms | -100.0% bf16 MFU | 61986 tok/s +step 3379/19560 | loss 3.685805 (+0.12z)| norm 0.2717 (-0.53z)| lr 5.71e-04 | 8456.51 ms | -100.0% bf16 MFU | 61987 tok/s +step 3380/19560 | loss 3.723798 (+1.08z)| norm 0.2787 (-0.25z)| lr 5.71e-04 | 8451.35 ms | -100.0% bf16 MFU | 61989 tok/s +step 3381/19560 | loss 3.659397 (-0.55z)| norm 0.2924 (+0.29z)| lr 5.71e-04 | 8454.55 ms | -100.0% bf16 MFU | 61990 tok/s +step 3382/19560 | loss 3.702288 (+0.53z)| norm 0.3085 (+0.91z)| lr 5.71e-04 | 8457.43 ms | -100.0% bf16 MFU | 61990 tok/s +step 3383/19560 | loss 3.663293 (-0.47z)| norm 0.3045 (+0.74z)| lr 5.71e-04 | 8451.28 ms | -100.0% bf16 MFU | 61993 tok/s +step 3384/19560 | loss 3.659622 (-0.57z)| norm 0.3083 (+0.88z)| lr 5.71e-04 | 8454.84 ms | -100.0% bf16 MFU | 61993 tok/s +step 3385/19560 | loss 3.669464 (-0.31z)| norm 0.3212 (+1.38z)| lr 5.71e-04 | 8445.76 ms | -100.0% bf16 MFU | 61998 tok/s +step 3386/19560 | loss 3.610053 (-1.79z)| norm 0.3421 (+2.21z)| lr 5.70e-04 | 8448.44 ms | -100.0% bf16 MFU | 62001 tok/s +step 3387/19560 | loss 3.725422 (+1.13z)| norm 0.2675 (-0.76z)| lr 5.70e-04 | 8458.36 ms | -100.0% bf16 MFU | 62000 tok/s +step 3388/19560 | loss 3.745564 (+1.61z)| norm 0.2856 (-0.00z)| lr 5.70e-04 | 8449.65 ms | -100.0% bf16 MFU | 62002 tok/s +step 3389/19560 | loss 3.660451 (-0.54z)| norm 0.2622 (-0.96z)| lr 5.70e-04 | 8453.98 ms | -100.0% bf16 MFU | 62003 tok/s +step 3390/19560 | loss 3.653433 (-0.72z)| norm 0.2790 (-0.27z)| lr 5.70e-04 | 8448.91 ms | -100.0% bf16 MFU | 62006 tok/s +step 3391/19560 | loss 3.642415 (-1.00z)| norm 0.2807 (-0.21z)| lr 5.70e-04 | 8457.54 ms | -100.0% bf16 MFU | 62005 tok/s +step 3392/19560 | loss 3.667485 (-0.36z)| norm 0.2563 (-1.22z)| lr 5.70e-04 | 8459.57 ms | -100.0% bf16 MFU | 62003 tok/s +step 3393/19560 | loss 3.645177 (-0.94z)| norm 0.2764 (-0.39z)| lr 5.70e-04 | 8459.32 ms | -100.0% bf16 MFU | 62002 tok/s +step 3394/19560 | loss 3.680915 (-0.01z)| norm 0.2629 (-0.95z)| lr 5.70e-04 | 8458.29 ms | -100.0% bf16 MFU | 62001 tok/s +step 3395/19560 | loss 3.706808 (+0.65z)| norm 0.2738 (-0.49z)| lr 5.70e-04 | 8463.75 ms | -100.0% bf16 MFU | 61998 tok/s +step 3396/19560 | loss 3.657634 (-0.61z)| norm 0.2528 (-1.34z)| lr 5.70e-04 | 8458.17 ms | -100.0% bf16 MFU | 61998 tok/s +step 3397/19560 | loss 3.725126 (+1.10z)| norm 0.2715 (-0.58z)| lr 5.70e-04 | 8455.43 ms | -100.0% bf16 MFU | 61998 tok/s +step 3398/19560 | loss 3.656513 (-0.65z)| norm 0.2519 (-1.37z)| lr 5.70e-04 | 8460.43 ms | -100.0% bf16 MFU | 61997 tok/s +step 3399/19560 | loss 3.622002 (-1.50z)| norm 0.2736 (-0.46z)| lr 5.70e-04 | 8451.93 ms | -100.0% bf16 MFU | 61998 tok/s +step 3400/19560 | loss 3.697516 (+0.41z)| norm 0.3258 (+1.68z)| lr 5.70e-04 | 8462.60 ms | -100.0% bf16 MFU | 61996 tok/s +step 3401/19560 | loss 3.641515 (-1.00z)| norm 0.3058 (+0.86z)| lr 5.70e-04 | 8458.43 ms | -100.0% bf16 MFU | 61996 tok/s +step 3402/19560 | loss 3.635210 (-1.14z)| norm 0.2377 (-1.91z)| lr 5.70e-04 | 8460.94 ms | -100.0% bf16 MFU | 61994 tok/s +step 3403/19560 | loss 3.662282 (-0.45z)| norm 0.2716 (-0.52z)| lr 5.70e-04 | 8463.20 ms | -100.0% bf16 MFU | 61992 tok/s +step 3404/19560 | loss 3.658686 (-0.54z)| norm 0.2707 (-0.56z)| lr 5.70e-04 | 8463.30 ms | -100.0% bf16 MFU | 61990 tok/s +step 3405/19560 | loss 3.635017 (-1.14z)| norm 0.2790 (-0.22z)| lr 5.70e-04 | 8454.01 ms | -100.0% bf16 MFU | 61991 tok/s +step 3406/19560 | loss 3.696266 (+0.48z)| norm 0.2711 (-0.53z)| lr 5.70e-04 | 8459.48 ms | -100.0% bf16 MFU | 61990 tok/s +step 3407/19560 | loss 3.716626 (+1.02z)| norm 0.2435 (-1.63z)| lr 5.70e-04 | 8458.54 ms | -100.0% bf16 MFU | 61990 tok/s +step 3408/19560 | loss 3.622612 (-1.44z)| norm 0.2491 (-1.38z)| lr 5.70e-04 | 8455.22 ms | -100.0% bf16 MFU | 61991 tok/s +step 3409/19560 | loss 3.681032 (+0.09z)| norm 0.2476 (-1.42z)| lr 5.70e-04 | 8455.29 ms | -100.0% bf16 MFU | 61992 tok/s +step 3410/19560 | loss 3.662804 (-0.39z)| norm 0.2442 (-1.53z)| lr 5.70e-04 | 8452.92 ms | -100.0% bf16 MFU | 61993 tok/s +step 3411/19560 | loss 3.781117 (+2.63z)| norm 0.2463 (-1.42z)| lr 5.70e-04 | 8458.23 ms | -100.0% bf16 MFU | 61993 tok/s +step 3412/19560 | loss 3.635949 (-1.08z)| norm 0.2627 (-0.76z)| lr 5.70e-04 | 8458.70 ms | -100.0% bf16 MFU | 61992 tok/s +step 3413/19560 | loss 3.667136 (-0.28z)| norm 0.2786 (-0.12z)| lr 5.70e-04 | 8457.79 ms | -100.0% bf16 MFU | 61992 tok/s +step 3414/19560 | loss 3.666775 (-0.28z)| norm 0.3064 (+0.99z)| lr 5.70e-04 | 8457.27 ms | -100.0% bf16 MFU | 61992 tok/s +step 3415/19560 | loss 3.721602 (+1.14z)| norm 0.3123 (+1.20z)| lr 5.70e-04 | 8453.73 ms | -100.0% bf16 MFU | 61994 tok/s +step 3416/19560 | loss 3.663477 (-0.37z)| norm 0.3403 (+2.25z)| lr 5.70e-04 | 8456.29 ms | -100.0% bf16 MFU | 61994 tok/s +step 3417/19560 | loss 3.710815 (+0.88z)| norm 0.3055 (+0.89z)| lr 5.70e-04 | 8455.43 ms | -100.0% bf16 MFU | 61994 tok/s +step 3418/19560 | loss 3.698207 (+0.60z)| norm 0.3055 (+0.88z)| lr 5.70e-04 | 8455.92 ms | -100.0% bf16 MFU | 61995 tok/s +step 3419/19560 | loss 3.600451 (-2.05z)| norm 0.3083 (+0.98z)| lr 5.70e-04 | 8456.04 ms | -100.0% bf16 MFU | 61995 tok/s +step 3420/19560 | loss 3.707572 (+0.85z)| norm 0.3062 (+0.89z)| lr 5.70e-04 | 8453.43 ms | -100.0% bf16 MFU | 61996 tok/s +step 3421/19560 | loss 3.701756 (+0.69z)| norm 0.2853 (+0.09z)| lr 5.70e-04 | 8453.43 ms | -100.0% bf16 MFU | 61998 tok/s +step 3422/19560 | loss 3.636145 (-1.09z)| norm 0.2912 (+0.33z)| lr 5.70e-04 | 8455.32 ms | -100.0% bf16 MFU | 61998 tok/s +step 3423/19560 | loss 3.624971 (-1.38z)| norm 0.2767 (-0.22z)| lr 5.70e-04 | 8457.44 ms | -100.0% bf16 MFU | 61998 tok/s +step 3424/19560 | loss 3.662322 (-0.37z)| norm 0.2703 (-0.47z)| lr 5.70e-04 | 8451.90 ms | -100.0% bf16 MFU | 62000 tok/s +step 3425/19560 | loss 3.702276 (+0.69z)| norm 0.2646 (-0.69z)| lr 5.70e-04 | 8456.64 ms | -100.0% bf16 MFU | 61999 tok/s +step 3426/19560 | loss 3.697410 (+0.55z)| norm 0.2618 (-0.80z)| lr 5.70e-04 | 8454.26 ms | -100.0% bf16 MFU | 62000 tok/s +step 3427/19560 | loss 3.599845 (-2.04z)| norm 0.2769 (-0.20z)| lr 5.70e-04 | 8454.88 ms | -100.0% bf16 MFU | 62001 tok/s +step 3428/19560 | loss 3.739226 (+1.64z)| norm 0.3045 (+0.87z)| lr 5.70e-04 | 8456.48 ms | -100.0% bf16 MFU | 62001 tok/s +step 3429/19560 | loss 3.646264 (-0.80z)| norm 0.3174 (+1.35z)| lr 5.70e-04 | 8456.44 ms | -100.0% bf16 MFU | 62000 tok/s +step 3430/19560 | loss 3.659383 (-0.45z)| norm 0.3007 (+0.70z)| lr 5.70e-04 | 8455.47 ms | -100.0% bf16 MFU | 62001 tok/s +step 3431/19560 | loss 3.681591 (+0.14z)| norm 0.2928 (+0.39z)| lr 5.70e-04 | 8451.85 ms | -100.0% bf16 MFU | 62002 tok/s +step 3432/19560 | loss 3.637192 (-1.02z)| norm 0.2762 (-0.25z)| lr 5.69e-04 | 8454.36 ms | -100.0% bf16 MFU | 62003 tok/s +step 3433/19560 | loss 3.653524 (-0.59z)| norm 0.2535 (-1.11z)| lr 5.69e-04 | 8454.93 ms | -100.0% bf16 MFU | 62003 tok/s +step 3434/19560 | loss 3.632120 (-1.15z)| norm 0.2831 (+0.02z)| lr 5.69e-04 | 8462.86 ms | -100.0% bf16 MFU | 62001 tok/s +step 3435/19560 | loss 3.693398 (+0.45z)| norm 0.2904 (+0.31z)| lr 5.69e-04 | 8452.25 ms | -100.0% bf16 MFU | 62002 tok/s +step 3436/19560 | loss 3.661461 (-0.39z)| norm 0.2804 (-0.08z)| lr 5.69e-04 | 8458.40 ms | -100.0% bf16 MFU | 62001 tok/s +step 3437/19560 | loss 3.699193 (+0.59z)| norm 0.2645 (-0.70z)| lr 5.69e-04 | 8450.75 ms | -100.0% bf16 MFU | 62003 tok/s +step 3438/19560 | loss 3.655993 (-0.54z)| norm 0.2865 (+0.15z)| lr 5.69e-04 | 8452.57 ms | -100.0% bf16 MFU | 62004 tok/s +step 3439/19560 | loss 3.659465 (-0.43z)| norm 0.3256 (+1.63z)| lr 5.69e-04 | 8450.62 ms | -100.0% bf16 MFU | 62006 tok/s +step 3440/19560 | loss 3.638627 (-0.99z)| norm 0.2808 (-0.11z)| lr 5.69e-04 | 8449.06 ms | -100.0% bf16 MFU | 62009 tok/s +step 3441/19560 | loss 3.633631 (-1.11z)| norm 0.2985 (+0.57z)| lr 5.69e-04 | 8445.34 ms | -100.0% bf16 MFU | 62012 tok/s +step 3442/19560 | loss 3.768081 (+2.38z)| norm 0.2911 (+0.28z)| lr 5.69e-04 | 8444.72 ms | -100.0% bf16 MFU | 62016 tok/s +step 3443/19560 | loss 3.611083 (-1.65z)| norm 0.2898 (+0.22z)| lr 5.69e-04 | 8443.91 ms | -100.0% bf16 MFU | 62019 tok/s +step 3444/19560 | loss 3.735781 (+1.52z)| norm 0.2817 (-0.10z)| lr 5.69e-04 | 8442.63 ms | -100.0% bf16 MFU | 62024 tok/s +step 3445/19560 | loss 3.642583 (-0.84z)| norm 0.2839 (-0.03z)| lr 5.69e-04 | 8444.42 ms | -100.0% bf16 MFU | 62027 tok/s +step 3446/19560 | loss 3.747440 (+1.84z)| norm 0.2626 (-0.87z)| lr 5.69e-04 | 8443.08 ms | -100.0% bf16 MFU | 62030 tok/s +step 3447/19560 | loss 3.676851 (+0.02z)| norm 0.3200 (+1.39z)| lr 5.69e-04 | 8446.61 ms | -100.0% bf16 MFU | 62032 tok/s +step 3448/19560 | loss 3.677494 (+0.02z)| norm 0.2921 (+0.29z)| lr 5.69e-04 | 8447.51 ms | -100.0% bf16 MFU | 62034 tok/s +step 3449/19560 | loss 3.717627 (+1.06z)| norm 0.3000 (+0.61z)| lr 5.69e-04 | 8442.91 ms | -100.0% bf16 MFU | 62037 tok/s +step 3450/19560 | loss 3.668646 (-0.24z)| norm 0.3193 (+1.35z)| lr 5.69e-04 | 8451.13 ms | -100.0% bf16 MFU | 62037 tok/s +step 3451/19560 | loss 3.677724 (-0.00z)| norm 0.3357 (+1.95z)| lr 5.69e-04 | 8454.76 ms | -100.0% bf16 MFU | 62036 tok/s +step 3452/19560 | loss 3.667451 (-0.28z)| norm 0.3399 (+2.07z)| lr 5.69e-04 | 8447.53 ms | -100.0% bf16 MFU | 62037 tok/s +step 3453/19560 | loss 3.700748 (+0.60z)| norm 0.3363 (+1.89z)| lr 5.69e-04 | 8448.86 ms | -100.0% bf16 MFU | 62038 tok/s +step 3454/19560 | loss 3.633217 (-1.18z)| norm 0.2871 (+0.04z)| lr 5.69e-04 | 8454.45 ms | -100.0% bf16 MFU | 62037 tok/s +step 3455/19560 | loss 3.677836 (+0.01z)| norm 0.2996 (+0.50z)| lr 5.69e-04 | 8452.82 ms | -100.0% bf16 MFU | 62036 tok/s +step 3456/19560 | loss 3.673745 (-0.11z)| norm 0.2885 (+0.07z)| lr 5.69e-04 | 8448.54 ms | -100.0% bf16 MFU | 62037 tok/s +step 3457/19560 | loss 3.668754 (-0.24z)| norm 0.2798 (-0.25z)| lr 5.69e-04 | 8452.42 ms | -100.0% bf16 MFU | 62037 tok/s +step 3458/19560 | loss 3.670712 (-0.18z)| norm 0.2606 (-0.97z)| lr 5.69e-04 | 8451.49 ms | -100.0% bf16 MFU | 62037 tok/s +step 3459/19560 | loss 3.653450 (-0.63z)| norm 0.2438 (-1.59z)| lr 5.69e-04 | 8452.22 ms | -100.0% bf16 MFU | 62036 tok/s +step 3460/19560 | loss 3.752882 (+1.97z)| norm 0.2624 (-0.90z)| lr 5.69e-04 | 8455.98 ms | -100.0% bf16 MFU | 62035 tok/s +step 3461/19560 | loss 3.689979 (+0.32z)| norm 0.2623 (-0.91z)| lr 5.69e-04 | 8452.47 ms | -100.0% bf16 MFU | 62034 tok/s +step 3462/19560 | loss 3.686717 (+0.22z)| norm 0.2719 (-0.55z)| lr 5.69e-04 | 8452.64 ms | -100.0% bf16 MFU | 62034 tok/s +step 3463/19560 | loss 3.697967 (+0.51z)| norm 0.2631 (-0.88z)| lr 5.69e-04 | 8448.57 ms | -100.0% bf16 MFU | 62035 tok/s +step 3464/19560 | loss 3.625752 (-1.37z)| norm 0.2507 (-1.32z)| lr 5.69e-04 | 8451.52 ms | -100.0% bf16 MFU | 62035 tok/s +step 3465/19560 | loss 3.645547 (-0.84z)| norm 0.3072 (+0.79z)| lr 5.69e-04 | 8448.72 ms | -100.0% bf16 MFU | 62036 tok/s +step 3466/19560 | loss 3.650662 (-0.69z)| norm 0.2787 (-0.28z)| lr 5.69e-04 | 8453.10 ms | -100.0% bf16 MFU | 62035 tok/s +step 3467/19560 | loss 3.663157 (-0.37z)| norm 0.2920 (+0.22z)| lr 5.69e-04 | 8449.04 ms | -100.0% bf16 MFU | 62036 tok/s +step 3468/19560 | loss 3.641928 (-0.91z)| norm 0.2788 (-0.27z)| lr 5.69e-04 | 8453.23 ms | -100.0% bf16 MFU | 62036 tok/s +step 3469/19560 | loss 3.679646 (+0.08z)| norm 0.2872 (+0.03z)| lr 5.69e-04 | 8452.54 ms | -100.0% bf16 MFU | 62035 tok/s +step 3470/19560 | loss 3.652155 (-0.65z)| norm 0.2650 (-0.81z)| lr 5.69e-04 | 8452.67 ms | -100.0% bf16 MFU | 62035 tok/s +step 3471/19560 | loss 3.663339 (-0.35z)| norm 0.2985 (+0.45z)| lr 5.69e-04 | 8449.57 ms | -100.0% bf16 MFU | 62035 tok/s +step 3472/19560 | loss 3.667555 (-0.24z)| norm 0.2816 (-0.19z)| lr 5.69e-04 | 8450.37 ms | -100.0% bf16 MFU | 62036 tok/s +step 3473/19560 | loss 3.641830 (-0.91z)| norm 0.3038 (+0.64z)| lr 5.69e-04 | 8452.91 ms | -100.0% bf16 MFU | 62035 tok/s +step 3474/19560 | loss 3.648123 (-0.73z)| norm 0.3000 (+0.48z)| lr 5.69e-04 | 8448.89 ms | -100.0% bf16 MFU | 62036 tok/s +step 3475/19560 | loss 3.663089 (-0.35z)| norm 0.2896 (+0.08z)| lr 5.69e-04 | 8453.20 ms | -100.0% bf16 MFU | 62035 tok/s +step 3476/19560 | loss 3.780532 (+2.75z)| norm 0.2837 (-0.14z)| lr 5.69e-04 | 8449.81 ms | -100.0% bf16 MFU | 62036 tok/s +step 3477/19560 | loss 3.672558 (-0.11z)| norm 0.2848 (-0.09z)| lr 5.68e-04 | 8449.21 ms | -100.0% bf16 MFU | 62037 tok/s +step 3478/19560 | loss 3.677868 (+0.03z)| norm 0.2580 (-1.10z)| lr 5.68e-04 | 8451.25 ms | -100.0% bf16 MFU | 62037 tok/s +step 3479/19560 | loss 3.673894 (-0.06z)| norm 0.2796 (-0.27z)| lr 5.68e-04 | 8455.50 ms | -100.0% bf16 MFU | 62035 tok/s +step 3480/19560 | loss 3.694130 (+0.47z)| norm 0.2728 (-0.52z)| lr 5.68e-04 | 8448.55 ms | -100.0% bf16 MFU | 62036 tok/s +step 3481/19560 | loss 3.774765 (+2.53z)| norm 0.2981 (+0.44z)| lr 5.68e-04 | 8449.70 ms | -100.0% bf16 MFU | 62037 tok/s +step 3482/19560 | loss 3.662750 (-0.38z)| norm 0.2830 (-0.13z)| lr 5.68e-04 | 8453.60 ms | -100.0% bf16 MFU | 62036 tok/s +step 3483/19560 | loss 3.728597 (+1.31z)| norm 0.2876 (+0.07z)| lr 5.68e-04 | 8458.22 ms | -100.0% bf16 MFU | 62034 tok/s +step 3484/19560 | loss 3.649503 (-0.72z)| norm 0.2772 (-0.33z)| lr 5.68e-04 | 8454.85 ms | -100.0% bf16 MFU | 62032 tok/s +step 3485/19560 | loss 3.691260 (+0.36z)| norm 0.2569 (-1.20z)| lr 5.68e-04 | 8450.62 ms | -100.0% bf16 MFU | 62033 tok/s +step 3486/19560 | loss 3.632664 (-1.15z)| norm 0.2714 (-0.55z)| lr 5.68e-04 | 8448.84 ms | -100.0% bf16 MFU | 62034 tok/s +step 3487/19560 | loss 3.661654 (-0.40z)| norm 0.2964 (+0.57z)| lr 5.68e-04 | 8447.43 ms | -100.0% bf16 MFU | 62035 tok/s +step 3488/19560 | loss 3.656340 (-0.53z)| norm 0.2726 (-0.49z)| lr 5.68e-04 | 8445.41 ms | -100.0% bf16 MFU | 62038 tok/s +step 3489/19560 | loss 3.683198 (+0.19z)| norm 0.2864 (+0.13z)| lr 5.68e-04 | 8438.83 ms | -100.0% bf16 MFU | 62042 tok/s +step 3490/19560 | loss 3.722761 (+1.24z)| norm 0.2785 (-0.22z)| lr 5.68e-04 | 8440.33 ms | -100.0% bf16 MFU | 62046 tok/s +step 3491/19560 | loss 3.687960 (+0.32z)| norm 0.2758 (-0.33z)| lr 5.68e-04 | 8439.78 ms | -100.0% bf16 MFU | 62050 tok/s +step 3492/19560 | loss 3.711366 (+0.93z)| norm 0.2896 (+0.28z)| lr 5.68e-04 | 8437.72 ms | -100.0% bf16 MFU | 62054 tok/s +step 3493/19560 | loss 3.667550 (-0.23z)| norm 0.2826 (-0.04z)| lr 5.68e-04 | 8442.07 ms | -100.0% bf16 MFU | 62057 tok/s +step 3494/19560 | loss 3.669050 (-0.19z)| norm 0.2760 (-0.33z)| lr 5.68e-04 | 8443.44 ms | -100.0% bf16 MFU | 62058 tok/s +step 3495/19560 | loss 3.755547 (+2.06z)| norm 0.2674 (-0.71z)| lr 5.68e-04 | 8444.04 ms | -100.0% bf16 MFU | 62060 tok/s +step 3496/19560 | loss 3.703028 (+0.68z)| norm 0.2772 (-0.28z)| lr 5.68e-04 | 8448.12 ms | -100.0% bf16 MFU | 62060 tok/s +step 3497/19560 | loss 3.655292 (-0.56z)| norm 0.2783 (-0.23z)| lr 5.68e-04 | 8446.06 ms | -100.0% bf16 MFU | 62061 tok/s +step 3498/19560 | loss 3.662851 (-0.35z)| norm 0.2770 (-0.28z)| lr 5.68e-04 | 8447.65 ms | -100.0% bf16 MFU | 62061 tok/s +step 3499/19560 | loss 3.679024 (+0.08z)| norm 0.2649 (-0.83z)| lr 5.68e-04 | 8445.55 ms | -100.0% bf16 MFU | 62062 tok/s +step 3500/19560 | loss 3.587397 (-2.32z)| norm 0.2940 (+0.47z)| lr 5.68e-04 | 8447.13 ms | -100.0% bf16 MFU | 62062 tok/s +val loss 3.642134 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2634/10042 = 0.262298 +step 3501/19560 | loss 3.612176 (-1.63z)| norm 0.2894 (+0.25z)| lr 5.68e-04 | 8447.29 ms | -100.0% bf16 MFU | 62062 tok/s +step 3502/19560 | loss 3.710319 (+0.95z)| norm 0.2972 (+0.60z)| lr 5.68e-04 | 8449.23 ms | -100.0% bf16 MFU | 62062 tok/s +step 3503/19560 | loss 3.669511 (-0.13z)| norm 0.2677 (-0.73z)| lr 5.68e-04 | 8448.05 ms | -100.0% bf16 MFU | 62062 tok/s +step 3504/19560 | loss 3.696558 (+0.59z)| norm 0.2626 (-0.95z)| lr 5.68e-04 | 8452.31 ms | -100.0% bf16 MFU | 62060 tok/s +step 3505/19560 | loss 3.685194 (+0.28z)| norm 0.2667 (-0.76z)| lr 5.68e-04 | 8449.14 ms | -100.0% bf16 MFU | 62060 tok/s +step 3506/19560 | loss 3.659229 (-0.41z)| norm 0.2662 (-0.78z)| lr 5.68e-04 | 8451.10 ms | -100.0% bf16 MFU | 62058 tok/s +step 3507/19560 | loss 3.647023 (-0.73z)| norm 0.2652 (-0.82z)| lr 5.68e-04 | 8451.47 ms | -100.0% bf16 MFU | 62057 tok/s +step 3508/19560 | loss 3.619011 (-1.45z)| norm 0.2528 (-1.37z)| lr 5.68e-04 | 8449.45 ms | -100.0% bf16 MFU | 62057 tok/s +step 3509/19560 | loss 3.664991 (-0.23z)| norm 0.2879 (+0.23z)| lr 5.68e-04 | 8452.00 ms | -100.0% bf16 MFU | 62056 tok/s +step 3510/19560 | loss 3.686124 (+0.34z)| norm 0.3879 (+4.41z)| lr 5.68e-04 | 8454.30 ms | -100.0% bf16 MFU | 62054 tok/s +step 3511/19560 | loss 3.664538 (-0.24z)| norm 0.3311 (+1.98z)| lr 5.68e-04 | 8446.94 ms | -100.0% bf16 MFU | 62054 tok/s +step 3512/19560 | loss 3.654218 (-0.51z)| norm 0.3659 (+3.28z)| lr 5.68e-04 | 8448.61 ms | -100.0% bf16 MFU | 62054 tok/s +step 3513/19560 | loss 3.658231 (-0.40z)| norm 0.3226 (+1.54z)| lr 5.68e-04 | 8447.74 ms | -100.0% bf16 MFU | 62055 tok/s +step 3514/19560 | loss 3.684204 (+0.28z)| norm 0.2915 (+0.32z)| lr 5.68e-04 | 8449.96 ms | -100.0% bf16 MFU | 62054 tok/s +step 3515/19560 | loss 3.642282 (-0.84z)| norm 0.2777 (-0.25z)| lr 5.68e-04 | 8451.57 ms | -100.0% bf16 MFU | 62053 tok/s +step 3516/19560 | loss 3.628561 (-1.19z)| norm 0.2803 (-0.14z)| lr 5.68e-04 | 8448.15 ms | -100.0% bf16 MFU | 62054 tok/s +step 3517/19560 | loss 3.728248 (+1.50z)| norm 0.2457 (-1.55z)| lr 5.68e-04 | 8448.48 ms | -100.0% bf16 MFU | 62054 tok/s +step 3518/19560 | loss 3.694696 (+0.58z)| norm 0.2858 (+0.08z)| lr 5.68e-04 | 8449.93 ms | -100.0% bf16 MFU | 62053 tok/s +step 3519/19560 | loss 3.671582 (-0.05z)| norm 0.2819 (-0.07z)| lr 5.68e-04 | 8453.18 ms | -100.0% bf16 MFU | 62052 tok/s +step 3520/19560 | loss 3.715381 (+1.12z)| norm 0.2686 (-0.62z)| lr 5.68e-04 | 8450.50 ms | -100.0% bf16 MFU | 62051 tok/s +step 3521/19560 | loss 3.679795 (+0.15z)| norm 0.2481 (-1.44z)| lr 5.68e-04 | 8446.64 ms | -100.0% bf16 MFU | 62052 tok/s +step 3522/19560 | loss 3.678906 (+0.13z)| norm 0.2971 (+0.53z)| lr 5.67e-04 | 8449.87 ms | -100.0% bf16 MFU | 62052 tok/s +step 3523/19560 | loss 3.697944 (+0.65z)| norm 0.3125 (+1.14z)| lr 5.67e-04 | 8447.85 ms | -100.0% bf16 MFU | 62053 tok/s +step 3524/19560 | loss 3.693792 (+0.53z)| norm 0.3007 (+0.66z)| lr 5.67e-04 | 8448.39 ms | -100.0% bf16 MFU | 62053 tok/s +step 3525/19560 | loss 3.686033 (+0.33z)| norm 0.2761 (-0.35z)| lr 5.67e-04 | 8451.03 ms | -100.0% bf16 MFU | 62052 tok/s +step 3526/19560 | loss 3.757063 (+2.20z)| norm 0.2812 (-0.15z)| lr 5.67e-04 | 8452.62 ms | -100.0% bf16 MFU | 62051 tok/s +step 3527/19560 | loss 3.780832 (+2.74z)| norm 0.3073 (+0.91z)| lr 5.67e-04 | 8446.20 ms | -100.0% bf16 MFU | 62052 tok/s +step 3528/19560 | loss 3.713394 (+0.97z)| norm 0.3116 (+1.10z)| lr 5.67e-04 | 8449.62 ms | -100.0% bf16 MFU | 62052 tok/s +step 3529/19560 | loss 3.729829 (+1.38z)| norm 0.2950 (+0.41z)| lr 5.67e-04 | 8449.68 ms | -100.0% bf16 MFU | 62052 tok/s +step 3530/19560 | loss 3.633272 (-1.13z)| norm 0.2797 (-0.23z)| lr 5.67e-04 | 8447.92 ms | -100.0% bf16 MFU | 62052 tok/s +step 3531/19560 | loss 3.686558 (+0.25z)| norm 0.4611 (+6.13z)| lr 5.67e-04 | 8450.41 ms | -100.0% bf16 MFU | 62052 tok/s +step 3532/19560 | loss 3.678033 (+0.02z)| norm 0.2789 (-0.28z)| lr 5.67e-04 | 8448.23 ms | -100.0% bf16 MFU | 62052 tok/s +step 3533/19560 | loss 3.736604 (+1.52z)| norm 0.2729 (-0.49z)| lr 5.67e-04 | 8449.89 ms | -100.0% bf16 MFU | 62052 tok/s +step 3534/19560 | loss 3.644984 (-0.84z)| norm 0.2863 (-0.02z)| lr 5.67e-04 | 8451.44 ms | -100.0% bf16 MFU | 62051 tok/s +step 3535/19560 | loss 3.644416 (-0.84z)| norm 0.2764 (-0.38z)| lr 5.67e-04 | 8448.86 ms | -100.0% bf16 MFU | 62051 tok/s +step 3536/19560 | loss 3.722496 (+1.16z)| norm 0.2615 (-0.92z)| lr 5.67e-04 | 8446.07 ms | -100.0% bf16 MFU | 62052 tok/s +step 3537/19560 | loss 3.619533 (-1.48z)| norm 0.2738 (-0.49z)| lr 5.67e-04 | 8448.47 ms | -100.0% bf16 MFU | 62053 tok/s +step 3538/19560 | loss 3.621967 (-1.40z)| norm 0.2640 (-0.85z)| lr 5.67e-04 | 8451.07 ms | -100.0% bf16 MFU | 62052 tok/s +step 3539/19560 | loss 3.636115 (-1.04z)| norm 0.2805 (-0.27z)| lr 5.67e-04 | 8477.41 ms | -100.0% bf16 MFU | 62041 tok/s +step 3540/19560 | loss 3.693846 (+0.46z)| norm 0.3068 (+0.68z)| lr 5.67e-04 | 8472.93 ms | -100.0% bf16 MFU | 62033 tok/s +step 3541/19560 | loss 3.747625 (+1.83z)| norm 0.3352 (+1.68z)| lr 5.67e-04 | 8477.33 ms | -100.0% bf16 MFU | 62024 tok/s +step 3542/19560 | loss 3.683301 (+0.16z)| norm 0.3155 (+0.97z)| lr 5.67e-04 | 8472.99 ms | -100.0% bf16 MFU | 62017 tok/s +step 3543/19560 | loss 3.660528 (-0.42z)| norm 0.2678 (-0.74z)| lr 5.67e-04 | 8473.65 ms | -100.0% bf16 MFU | 62009 tok/s +step 3544/19560 | loss 3.602350 (-1.89z)| norm 0.2631 (-0.90z)| lr 5.67e-04 | 8471.05 ms | -100.0% bf16 MFU | 62004 tok/s +step 3545/19560 | loss 3.653990 (-0.56z)| norm 0.2507 (-1.33z)| lr 5.67e-04 | 8473.24 ms | -100.0% bf16 MFU | 61997 tok/s +step 3546/19560 | loss 3.664262 (-0.29z)| norm 0.2687 (-0.66z)| lr 5.67e-04 | 8473.73 ms | -100.0% bf16 MFU | 61991 tok/s +step 3547/19560 | loss 3.680460 (+0.12z)| norm 0.2837 (-0.11z)| lr 5.67e-04 | 8471.06 ms | -100.0% bf16 MFU | 61986 tok/s +step 3548/19560 | loss 3.597610 (-2.00z)| norm 0.2774 (-0.33z)| lr 5.67e-04 | 8473.08 ms | -100.0% bf16 MFU | 61980 tok/s +step 3549/19560 | loss 3.670890 (-0.10z)| norm 0.2541 (-1.17z)| lr 5.67e-04 | 8473.94 ms | -100.0% bf16 MFU | 61975 tok/s +step 3550/19560 | loss 3.672545 (-0.07z)| norm 0.2527 (-1.20z)| lr 5.67e-04 | 8469.97 ms | -100.0% bf16 MFU | 61971 tok/s +step 3551/19560 | loss 3.645786 (-0.77z)| norm 0.2436 (-1.51z)| lr 5.67e-04 | 8471.29 ms | -100.0% bf16 MFU | 61967 tok/s +step 3552/19560 | loss 3.650349 (-0.65z)| norm 0.2372 (-1.71z)| lr 5.67e-04 | 8465.48 ms | -100.0% bf16 MFU | 61965 tok/s +step 3553/19560 | loss 3.744508 (+1.77z)| norm 0.2759 (-0.34z)| lr 5.67e-04 | 8465.73 ms | -100.0% bf16 MFU | 61964 tok/s +step 3554/19560 | loss 3.660336 (-0.38z)| norm 0.2805 (-0.19z)| lr 5.67e-04 | 8467.64 ms | -100.0% bf16 MFU | 61961 tok/s +step 3555/19560 | loss 3.631503 (-1.14z)| norm 0.2657 (-0.71z)| lr 5.67e-04 | 8469.97 ms | -100.0% bf16 MFU | 61958 tok/s +step 3556/19560 | loss 3.625073 (-1.29z)| norm 0.2723 (-0.46z)| lr 5.67e-04 | 8466.37 ms | -100.0% bf16 MFU | 61957 tok/s +step 3557/19560 | loss 3.709152 (+0.89z)| norm 0.2512 (-1.19z)| lr 5.67e-04 | 8469.42 ms | -100.0% bf16 MFU | 61954 tok/s +step 3558/19560 | loss 3.672016 (-0.08z)| norm 0.2762 (-0.30z)| lr 5.67e-04 | 8464.60 ms | -100.0% bf16 MFU | 61953 tok/s +step 3559/19560 | loss 3.672590 (-0.07z)| norm 0.2754 (-0.33z)| lr 5.67e-04 | 8465.21 ms | -100.0% bf16 MFU | 61952 tok/s +step 3560/19560 | loss 3.639770 (-0.92z)| norm 0.2805 (-0.15z)| lr 5.67e-04 | 8468.06 ms | -100.0% bf16 MFU | 61950 tok/s +step 3561/19560 | loss 3.666638 (-0.23z)| norm 0.3053 (+0.72z)| lr 5.67e-04 | 8464.99 ms | -100.0% bf16 MFU | 61950 tok/s +step 3562/19560 | loss 3.643114 (-0.85z)| norm 0.3284 (+1.51z)| lr 5.67e-04 | 8466.40 ms | -100.0% bf16 MFU | 61948 tok/s +step 3563/19560 | loss 3.654929 (-0.53z)| norm 0.3429 (+1.98z)| lr 5.67e-04 | 8464.41 ms | -100.0% bf16 MFU | 61948 tok/s +step 3564/19560 | loss 3.657348 (-0.46z)| norm 0.3063 (+0.70z)| lr 5.67e-04 | 8466.18 ms | -100.0% bf16 MFU | 61947 tok/s +step 3565/19560 | loss 3.648919 (-0.67z)| norm 0.2910 (+0.17z)| lr 5.67e-04 | 8461.61 ms | -100.0% bf16 MFU | 61948 tok/s +step 3566/19560 | loss 3.702908 (+0.73z)| norm 0.5572 (+7.19z)| lr 5.66e-04 | 8461.20 ms | -100.0% bf16 MFU | 61948 tok/s +step 3567/19560 | loss 3.723372 (+1.25z)| norm 0.3076 (+0.52z)| lr 5.66e-04 | 8460.43 ms | -100.0% bf16 MFU | 61950 tok/s +step 3568/19560 | loss 3.617199 (-1.50z)| norm 0.2961 (+0.21z)| lr 5.66e-04 | 8465.86 ms | -100.0% bf16 MFU | 61949 tok/s +step 3569/19560 | loss 3.637825 (-0.97z)| norm 0.2763 (-0.32z)| lr 5.66e-04 | 8459.24 ms | -100.0% bf16 MFU | 61950 tok/s +step 3570/19560 | loss 3.726122 (+1.34z)| norm 0.2921 (+0.11z)| lr 5.66e-04 | 8466.80 ms | -100.0% bf16 MFU | 61949 tok/s +step 3571/19560 | loss 3.610392 (-1.70z)| norm 0.2889 (+0.02z)| lr 5.66e-04 | 8458.58 ms | -100.0% bf16 MFU | 61950 tok/s +step 3572/19560 | loss 3.621471 (-1.39z)| norm 0.2767 (-0.30z)| lr 5.66e-04 | 8464.22 ms | -100.0% bf16 MFU | 61950 tok/s +step 3573/19560 | loss 3.723061 (+1.27z)| norm 0.2826 (-0.15z)| lr 5.66e-04 | 8465.53 ms | -100.0% bf16 MFU | 61949 tok/s +step 3574/19560 | loss 3.562896 (-2.85z)| norm 0.2971 (+0.24z)| lr 5.66e-04 | 8461.54 ms | -100.0% bf16 MFU | 61950 tok/s +step 3575/19560 | loss 3.641185 (-0.82z)| norm 0.2803 (-0.21z)| lr 5.66e-04 | 8470.52 ms | -100.0% bf16 MFU | 61947 tok/s +step 3576/19560 | loss 3.640944 (-0.82z)| norm 0.2713 (-0.45z)| lr 5.66e-04 | 8469.23 ms | -100.0% bf16 MFU | 61945 tok/s +step 3577/19560 | loss 3.648494 (-0.61z)| norm 0.2600 (-0.74z)| lr 5.66e-04 | 8466.96 ms | -100.0% bf16 MFU | 61944 tok/s +step 3578/19560 | loss 3.642253 (-0.76z)| norm 0.2547 (-0.87z)| lr 5.66e-04 | 8459.30 ms | -100.0% bf16 MFU | 61945 tok/s +step 3579/19560 | loss 3.649316 (-0.58z)| norm 0.2627 (-0.64z)| lr 5.66e-04 | 8456.31 ms | -100.0% bf16 MFU | 61948 tok/s +step 3580/19560 | loss 3.613400 (-1.48z)| norm 0.2401 (-1.23z)| lr 5.66e-04 | 8450.29 ms | -100.0% bf16 MFU | 61953 tok/s +step 3581/19560 | loss 3.682442 (+0.29z)| norm 0.2483 (-1.00z)| lr 5.66e-04 | 8447.75 ms | -100.0% bf16 MFU | 61958 tok/s +step 3582/19560 | loss 3.688441 (+0.43z)| norm 0.2767 (-0.22z)| lr 5.66e-04 | 8448.70 ms | -100.0% bf16 MFU | 61963 tok/s +step 3583/19560 | loss 3.691055 (+0.49z)| norm 0.2807 (-0.11z)| lr 5.66e-04 | 8451.68 ms | -100.0% bf16 MFU | 61967 tok/s +step 3584/19560 | loss 3.651662 (-0.51z)| norm 0.2743 (-0.28z)| lr 5.66e-04 | 8440.30 ms | -100.0% bf16 MFU | 61974 tok/s +step 3585/19560 | loss 3.662162 (-0.24z)| norm 0.2499 (-0.93z)| lr 5.66e-04 | 8443.76 ms | -100.0% bf16 MFU | 61980 tok/s +step 3586/19560 | loss 3.634528 (-0.93z)| norm 0.2457 (-1.04z)| lr 5.66e-04 | 8441.86 ms | -100.0% bf16 MFU | 61986 tok/s +step 3587/19560 | loss 3.710633 (+0.99z)| norm 0.2736 (-0.30z)| lr 5.66e-04 | 8454.31 ms | -100.0% bf16 MFU | 61988 tok/s +step 3588/19560 | loss 3.579043 (-2.31z)| norm 0.2873 (+0.07z)| lr 5.66e-04 | 8444.24 ms | -100.0% bf16 MFU | 61993 tok/s +step 3589/19560 | loss 3.676200 (+0.15z)| norm 0.2741 (-0.29z)| lr 5.66e-04 | 8446.58 ms | -100.0% bf16 MFU | 61997 tok/s +step 3590/19560 | loss 3.674070 (+0.10z)| norm 0.2683 (-0.45z)| lr 5.66e-04 | 8446.24 ms | -100.0% bf16 MFU | 62001 tok/s +step 3591/19560 | loss 3.628074 (-1.05z)| norm 0.2939 (+0.24z)| lr 5.66e-04 | 8440.87 ms | -100.0% bf16 MFU | 62006 tok/s +step 3592/19560 | loss 3.676702 (+0.17z)| norm 0.2903 (+0.14z)| lr 5.66e-04 | 8441.86 ms | -100.0% bf16 MFU | 62011 tok/s +step 3593/19560 | loss 3.677428 (+0.18z)| norm 0.2843 (-0.02z)| lr 5.66e-04 | 8446.34 ms | -100.0% bf16 MFU | 62014 tok/s +step 3594/19560 | loss 3.653195 (-0.43z)| norm 0.2840 (-0.03z)| lr 5.66e-04 | 8442.98 ms | -100.0% bf16 MFU | 62018 tok/s +step 3595/19560 | loss 3.685138 (+0.37z)| norm 0.2641 (-0.57z)| lr 5.66e-04 | 8438.90 ms | -100.0% bf16 MFU | 62024 tok/s +step 3596/19560 | loss 3.684616 (+0.35z)| norm 0.2688 (-0.44z)| lr 5.66e-04 | 8447.05 ms | -100.0% bf16 MFU | 62026 tok/s +step 3597/19560 | loss 3.633677 (-0.93z)| norm 0.2785 (-0.17z)| lr 5.66e-04 | 8448.33 ms | -100.0% bf16 MFU | 62028 tok/s +step 3598/19560 | loss 3.633492 (-0.93z)| norm 0.2817 (-0.09z)| lr 5.66e-04 | 8441.59 ms | -100.0% bf16 MFU | 62032 tok/s +step 3599/19560 | loss 3.677829 (+0.19z)| norm 0.2773 (-0.21z)| lr 5.66e-04 | 8449.71 ms | -100.0% bf16 MFU | 62032 tok/s +step 3600/19560 | loss 3.555266 (-2.80z)| norm 0.3353 (+1.36z)| lr 5.66e-04 | 8453.97 ms | -100.0% bf16 MFU | 62032 tok/s +step 3601/19560 | loss 3.650114 (-0.48z)| norm 0.8458 (+9.03z)| lr 5.66e-04 | 8440.37 ms | -100.0% bf16 MFU | 62036 tok/s +step 3602/19560 | loss 3.712374 (+1.03z)| norm 0.3258 (+0.58z)| lr 5.66e-04 | 8449.81 ms | -100.0% bf16 MFU | 62037 tok/s +step 3603/19560 | loss 3.618725 (-1.24z)| norm 0.3248 (+0.56z)| lr 5.66e-04 | 8446.70 ms | -100.0% bf16 MFU | 62038 tok/s +step 3604/19560 | loss 3.675862 (+0.17z)| norm 0.3166 (+0.43z)| lr 5.66e-04 | 8451.05 ms | -100.0% bf16 MFU | 62038 tok/s +step 3605/19560 | loss 3.647789 (-0.52z)| norm 0.3196 (+0.47z)| lr 5.66e-04 | 8441.99 ms | -100.0% bf16 MFU | 62042 tok/s +step 3606/19560 | loss 3.596161 (-1.77z)| norm 0.3545 (+1.02z)| lr 5.66e-04 | 8445.50 ms | -100.0% bf16 MFU | 62043 tok/s +step 3607/19560 | loss 3.662213 (-0.14z)| norm 0.3626 (+1.13z)| lr 5.66e-04 | 8445.53 ms | -100.0% bf16 MFU | 62045 tok/s +step 3608/19560 | loss 3.586784 (-1.96z)| norm 0.3374 (+0.72z)| lr 5.66e-04 | 8442.89 ms | -100.0% bf16 MFU | 62048 tok/s +step 3609/19560 | loss 3.661755 (-0.11z)| norm 0.2879 (-0.07z)| lr 5.65e-04 | 8447.15 ms | -100.0% bf16 MFU | 62049 tok/s +step 3610/19560 | loss 3.625424 (-1.01z)| norm 0.2837 (-0.14z)| lr 5.65e-04 | 8448.47 ms | -100.0% bf16 MFU | 62049 tok/s +step 3611/19560 | loss 3.721840 (+1.40z)| norm 0.2608 (-0.50z)| lr 5.65e-04 | 8445.83 ms | -100.0% bf16 MFU | 62051 tok/s +step 3612/19560 | loss 3.619125 (-1.16z)| norm 0.2669 (-0.40z)| lr 5.65e-04 | 8452.01 ms | -100.0% bf16 MFU | 62050 tok/s +step 3613/19560 | loss 3.660099 (-0.13z)| norm 0.2776 (-0.23z)| lr 5.65e-04 | 8442.50 ms | -100.0% bf16 MFU | 62052 tok/s +step 3614/19560 | loss 3.651362 (-0.35z)| norm 0.2950 (+0.04z)| lr 5.65e-04 | 8446.04 ms | -100.0% bf16 MFU | 62053 tok/s +step 3615/19560 | loss 3.607408 (-1.43z)| norm 0.2921 (-0.00z)| lr 5.65e-04 | 8447.68 ms | -100.0% bf16 MFU | 62054 tok/s +step 3616/19560 | loss 3.656480 (-0.21z)| norm 0.2847 (-0.12z)| lr 5.65e-04 | 8457.66 ms | -100.0% bf16 MFU | 62051 tok/s +step 3617/19560 | loss 3.640748 (-0.60z)| norm 0.2937 (+0.02z)| lr 5.65e-04 | 8458.09 ms | -100.0% bf16 MFU | 62047 tok/s +step 3618/19560 | loss 3.618896 (-1.12z)| norm 0.2875 (-0.08z)| lr 5.65e-04 | 8452.92 ms | -100.0% bf16 MFU | 62046 tok/s +step 3619/19560 | loss 3.623104 (-1.00z)| norm 0.2752 (-0.28z)| lr 5.65e-04 | 8448.20 ms | -100.0% bf16 MFU | 62047 tok/s +step 3620/19560 | loss 3.672994 (+0.24z)| norm 0.2440 (-0.77z)| lr 5.65e-04 | 8452.88 ms | -100.0% bf16 MFU | 62046 tok/s +step 3621/19560 | loss 3.669158 (+0.15z)| norm 0.2631 (-0.46z)| lr 5.65e-04 | 8455.05 ms | -100.0% bf16 MFU | 62044 tok/s +step 3622/19560 | loss 3.602786 (-1.48z)| norm 0.2513 (-0.65z)| lr 5.65e-04 | 8451.23 ms | -100.0% bf16 MFU | 62044 tok/s +step 3623/19560 | loss 3.680144 (+0.45z)| norm 0.2390 (-0.84z)| lr 5.65e-04 | 8450.45 ms | -100.0% bf16 MFU | 62044 tok/s +step 3624/19560 | loss 3.664580 (+0.07z)| norm 0.2706 (-0.33z)| lr 5.65e-04 | 8441.89 ms | -100.0% bf16 MFU | 62047 tok/s +step 3625/19560 | loss 3.674053 (+0.31z)| norm 0.2608 (-0.49z)| lr 5.65e-04 | 8444.07 ms | -100.0% bf16 MFU | 62049 tok/s +step 3626/19560 | loss 3.659032 (-0.07z)| norm 0.2347 (-0.89z)| lr 5.65e-04 | 8440.55 ms | -100.0% bf16 MFU | 62052 tok/s +step 3627/19560 | loss 3.616997 (-1.12z)| norm 0.2589 (-0.51z)| lr 5.65e-04 | 8443.94 ms | -100.0% bf16 MFU | 62054 tok/s +step 3628/19560 | loss 3.661634 (-0.01z)| norm 0.2539 (-0.58z)| lr 5.65e-04 | 8436.88 ms | -100.0% bf16 MFU | 62058 tok/s +step 3629/19560 | loss 3.652700 (-0.25z)| norm 0.2599 (-0.48z)| lr 5.65e-04 | 8437.12 ms | -100.0% bf16 MFU | 62063 tok/s +step 3630/19560 | loss 3.588833 (-1.85z)| norm 0.2523 (-0.60z)| lr 5.65e-04 | 8438.83 ms | -100.0% bf16 MFU | 62066 tok/s +step 3631/19560 | loss 3.611857 (-1.24z)| norm 0.2592 (-0.49z)| lr 5.65e-04 | 8442.42 ms | -100.0% bf16 MFU | 62068 tok/s +step 3632/19560 | loss 3.646828 (-0.35z)| norm 0.2522 (-0.60z)| lr 5.65e-04 | 8442.82 ms | -100.0% bf16 MFU | 62069 tok/s +step 3633/19560 | loss 3.674263 (+0.35z)| norm 0.2559 (-0.54z)| lr 5.65e-04 | 8446.26 ms | -100.0% bf16 MFU | 62069 tok/s +step 3634/19560 | loss 3.677206 (+0.42z)| norm 0.2678 (-0.35z)| lr 5.65e-04 | 8435.84 ms | -100.0% bf16 MFU | 62073 tok/s +step 3635/19560 | loss 3.605441 (-1.38z)| norm 0.2536 (-0.57z)| lr 5.65e-04 | 8438.64 ms | -100.0% bf16 MFU | 62076 tok/s +step 3636/19560 | loss 3.641354 (-0.48z)| norm 0.2653 (-0.39z)| lr 5.65e-04 | 8443.58 ms | -100.0% bf16 MFU | 62077 tok/s +step 3637/19560 | loss 3.689096 (+0.72z)| norm 0.2689 (-0.33z)| lr 5.65e-04 | 8440.24 ms | -100.0% bf16 MFU | 62079 tok/s +step 3638/19560 | loss 3.645022 (-0.39z)| norm 0.2800 (-0.14z)| lr 5.65e-04 | 8440.54 ms | -100.0% bf16 MFU | 62081 tok/s +step 3639/19560 | loss 3.616208 (-1.10z)| norm 0.2585 (-0.48z)| lr 5.65e-04 | 8441.32 ms | -100.0% bf16 MFU | 62082 tok/s +step 3640/19560 | loss 3.656190 (-0.09z)| norm 0.2696 (-0.29z)| lr 5.65e-04 | 8443.53 ms | -100.0% bf16 MFU | 62083 tok/s +step 3641/19560 | loss 3.645916 (-0.35z)| norm 0.2792 (-0.13z)| lr 5.65e-04 | 8440.93 ms | -100.0% bf16 MFU | 62084 tok/s +step 3642/19560 | loss 3.628208 (-0.78z)| norm 0.3006 (+0.21z)| lr 5.65e-04 | 8441.16 ms | -100.0% bf16 MFU | 62086 tok/s +step 3643/19560 | loss 3.652582 (-0.17z)| norm 0.2943 (+0.11z)| lr 5.65e-04 | 8441.39 ms | -100.0% bf16 MFU | 62087 tok/s +step 3644/19560 | loss 3.591292 (-1.69z)| norm 0.2689 (-0.30z)| lr 5.65e-04 | 8444.99 ms | -100.0% bf16 MFU | 62087 tok/s +step 3645/19560 | loss 3.652242 (-0.16z)| norm 0.2787 (-0.14z)| lr 5.65e-04 | 8441.33 ms | -100.0% bf16 MFU | 62088 tok/s +step 3646/19560 | loss 3.661000 (+0.07z)| norm 0.2533 (-0.55z)| lr 5.65e-04 | 8443.86 ms | -100.0% bf16 MFU | 62088 tok/s +step 3647/19560 | loss 3.555830 (-2.51z)| norm 0.2520 (-0.56z)| lr 5.65e-04 | 8446.38 ms | -100.0% bf16 MFU | 62087 tok/s +step 3648/19560 | loss 3.673374 (+0.40z)| norm 0.2680 (-0.31z)| lr 5.65e-04 | 8442.62 ms | -100.0% bf16 MFU | 62088 tok/s +step 3649/19560 | loss 3.638463 (-0.46z)| norm 0.2806 (-0.11z)| lr 5.65e-04 | 8448.48 ms | -100.0% bf16 MFU | 62086 tok/s +step 3650/19560 | loss 3.616372 (-0.99z)| norm 0.3134 (+0.41z)| lr 5.65e-04 | 8446.51 ms | -100.0% bf16 MFU | 62086 tok/s +step 3651/19560 | loss 3.679102 (+0.57z)| norm 0.3199 (+0.52z)| lr 5.65e-04 | 8443.60 ms | -100.0% bf16 MFU | 62086 tok/s +step 3652/19560 | loss 3.662667 (+0.17z)| norm 0.3177 (+0.48z)| lr 5.64e-04 | 8444.73 ms | -100.0% bf16 MFU | 62086 tok/s +step 3653/19560 | loss 3.620922 (-0.86z)| norm 0.3032 (+0.24z)| lr 5.64e-04 | 8449.98 ms | -100.0% bf16 MFU | 62084 tok/s +step 3654/19560 | loss 3.618938 (-0.90z)| norm 0.3287 (+0.65z)| lr 5.64e-04 | 8450.42 ms | -100.0% bf16 MFU | 62082 tok/s +step 3655/19560 | loss 3.656762 (+0.09z)| norm 0.3092 (+0.33z)| lr 5.64e-04 | 8450.92 ms | -100.0% bf16 MFU | 62080 tok/s +step 3656/19560 | loss 3.639400 (-0.36z)| norm 0.2836 (-0.07z)| lr 5.64e-04 | 8444.40 ms | -100.0% bf16 MFU | 62080 tok/s +step 3657/19560 | loss 3.628272 (-0.65z)| norm 0.2875 (-0.01z)| lr 5.64e-04 | 8451.72 ms | -100.0% bf16 MFU | 62078 tok/s +step 3658/19560 | loss 3.624367 (-0.75z)| norm 0.2803 (-0.12z)| lr 5.64e-04 | 8448.31 ms | -100.0% bf16 MFU | 62077 tok/s +step 3659/19560 | loss 3.550943 (-2.66z)| norm 0.2989 (+0.20z)| lr 5.64e-04 | 8451.23 ms | -100.0% bf16 MFU | 62075 tok/s +step 3660/19560 | loss 3.655771 (+0.14z)| norm 0.2615 (-0.42z)| lr 5.64e-04 | 8452.41 ms | -100.0% bf16 MFU | 62072 tok/s +step 3661/19560 | loss 3.631750 (-0.49z)| norm 0.2790 (-0.13z)| lr 5.64e-04 | 8452.41 ms | -100.0% bf16 MFU | 62070 tok/s +step 3662/19560 | loss 3.685945 (+0.97z)| norm 0.2772 (-0.16z)| lr 5.64e-04 | 8450.85 ms | -100.0% bf16 MFU | 62069 tok/s +step 3663/19560 | loss 3.690351 (+1.07z)| norm 0.2679 (-0.31z)| lr 5.64e-04 | 8447.84 ms | -100.0% bf16 MFU | 62068 tok/s +step 3664/19560 | loss 3.632112 (-0.48z)| norm 0.3278 (+0.67z)| lr 5.64e-04 | 8451.77 ms | -100.0% bf16 MFU | 62067 tok/s +step 3665/19560 | loss 3.680010 (+0.81z)| norm 0.3332 (+0.75z)| lr 5.64e-04 | 8449.28 ms | -100.0% bf16 MFU | 62066 tok/s +step 3666/19560 | loss 3.734545 (+2.24z)| norm 0.3584 (+1.15z)| lr 5.64e-04 | 8448.42 ms | -100.0% bf16 MFU | 62065 tok/s +step 3667/19560 | loss 3.570422 (-2.12z)| norm 0.3072 (+0.31z)| lr 5.64e-04 | 8450.56 ms | -100.0% bf16 MFU | 62064 tok/s +step 3668/19560 | loss 3.700508 (+1.31z)| norm 0.3404 (+0.84z)| lr 5.64e-04 | 8450.49 ms | -100.0% bf16 MFU | 62063 tok/s +step 3669/19560 | loss 3.592950 (-1.52z)| norm 0.3114 (+0.37z)| lr 5.64e-04 | 8449.29 ms | -100.0% bf16 MFU | 62063 tok/s +step 3670/19560 | loss 3.621698 (-0.73z)| norm 0.3168 (+0.46z)| lr 5.64e-04 | 8450.45 ms | -100.0% bf16 MFU | 62062 tok/s +step 3671/19560 | loss 3.627442 (-0.57z)| norm 0.2691 (-0.32z)| lr 5.64e-04 | 8447.13 ms | -100.0% bf16 MFU | 62062 tok/s +step 3672/19560 | loss 3.586316 (-1.66z)| norm 0.2719 (-0.28z)| lr 5.64e-04 | 8449.90 ms | -100.0% bf16 MFU | 62061 tok/s +step 3673/19560 | loss 3.632162 (-0.44z)| norm 0.2831 (-0.10z)| lr 5.64e-04 | 8449.67 ms | -100.0% bf16 MFU | 62060 tok/s +step 3674/19560 | loss 3.700858 (+1.38z)| norm 0.2850 (-0.07z)| lr 5.64e-04 | 8447.81 ms | -100.0% bf16 MFU | 62060 tok/s +step 3675/19560 | loss 3.646742 (-0.05z)| norm 0.2772 (-0.19z)| lr 5.64e-04 | 8450.74 ms | -100.0% bf16 MFU | 62059 tok/s +step 3676/19560 | loss 3.649642 (+0.02z)| norm 0.2902 (+0.02z)| lr 5.64e-04 | 8451.42 ms | -100.0% bf16 MFU | 62058 tok/s +step 3677/19560 | loss 3.698860 (+1.32z)| norm 0.2720 (-0.28z)| lr 5.64e-04 | 8449.48 ms | -100.0% bf16 MFU | 62058 tok/s +step 3678/19560 | loss 3.638060 (-0.29z)| norm 0.2710 (-0.30z)| lr 5.64e-04 | 8448.11 ms | -100.0% bf16 MFU | 62058 tok/s +step 3679/19560 | loss 3.626681 (-0.59z)| norm 0.2830 (-0.11z)| lr 5.64e-04 | 8447.96 ms | -100.0% bf16 MFU | 62058 tok/s +step 3680/19560 | loss 3.658765 (+0.27z)| norm 0.2881 (-0.03z)| lr 5.64e-04 | 8452.30 ms | -100.0% bf16 MFU | 62057 tok/s +step 3681/19560 | loss 3.715299 (+1.80z)| norm 0.2851 (-0.08z)| lr 5.64e-04 | 8451.91 ms | -100.0% bf16 MFU | 62055 tok/s +step 3682/19560 | loss 3.626621 (-0.58z)| norm 0.2671 (-0.38z)| lr 5.64e-04 | 8449.59 ms | -100.0% bf16 MFU | 62055 tok/s +step 3683/19560 | loss 3.661076 (+0.34z)| norm 0.2688 (-0.35z)| lr 5.64e-04 | 8454.86 ms | -100.0% bf16 MFU | 62053 tok/s +step 3684/19560 | loss 3.681396 (+0.87z)| norm 0.2692 (-0.34z)| lr 5.64e-04 | 8450.76 ms | -100.0% bf16 MFU | 62052 tok/s +step 3685/19560 | loss 3.549093 (-2.61z)| norm 0.2478 (-0.70z)| lr 5.64e-04 | 8449.18 ms | -100.0% bf16 MFU | 62052 tok/s +step 3686/19560 | loss 3.685209 (+0.99z)| norm 0.2771 (-0.21z)| lr 5.64e-04 | 8448.85 ms | -100.0% bf16 MFU | 62052 tok/s +step 3687/19560 | loss 3.633994 (-0.36z)| norm 0.2496 (-0.66z)| lr 5.64e-04 | 8449.54 ms | -100.0% bf16 MFU | 62052 tok/s +step 3688/19560 | loss 3.654702 (+0.19z)| norm 0.2631 (-0.44z)| lr 5.64e-04 | 8447.09 ms | -100.0% bf16 MFU | 62053 tok/s +step 3689/19560 | loss 3.639973 (-0.20z)| norm 0.2815 (-0.13z)| lr 5.64e-04 | 8448.22 ms | -100.0% bf16 MFU | 62053 tok/s +step 3690/19560 | loss 3.615741 (-0.83z)| norm 0.2534 (-0.59z)| lr 5.64e-04 | 8449.49 ms | -100.0% bf16 MFU | 62053 tok/s +step 3691/19560 | loss 3.648644 (+0.04z)| norm 0.2452 (-0.71z)| lr 5.64e-04 | 8447.35 ms | -100.0% bf16 MFU | 62054 tok/s +step 3692/19560 | loss 3.623660 (-0.61z)| norm 0.2464 (-0.68z)| lr 5.64e-04 | 8447.90 ms | -100.0% bf16 MFU | 62054 tok/s +step 3693/19560 | loss 3.599877 (-1.22z)| norm 0.2608 (-0.44z)| lr 5.64e-04 | 8447.99 ms | -100.0% bf16 MFU | 62054 tok/s +step 3694/19560 | loss 3.710355 (+1.67z)| norm 0.2785 (-0.12z)| lr 5.63e-04 | 8449.07 ms | -100.0% bf16 MFU | 62054 tok/s +step 3695/19560 | loss 3.645373 (-0.02z)| norm 0.2976 (+0.22z)| lr 5.63e-04 | 8448.83 ms | -100.0% bf16 MFU | 62054 tok/s +step 3696/19560 | loss 3.640887 (-0.14z)| norm 0.2897 (+0.08z)| lr 5.63e-04 | 8450.87 ms | -100.0% bf16 MFU | 62054 tok/s +step 3697/19560 | loss 3.649864 (+0.10z)| norm 0.2935 (+0.15z)| lr 5.63e-04 | 8452.27 ms | -100.0% bf16 MFU | 62052 tok/s +step 3698/19560 | loss 3.726275 (+2.13z)| norm 0.2877 (+0.04z)| lr 5.63e-04 | 8449.28 ms | -100.0% bf16 MFU | 62052 tok/s +step 3699/19560 | loss 3.626237 (-0.54z)| norm 0.2856 (+0.01z)| lr 5.63e-04 | 8443.84 ms | -100.0% bf16 MFU | 62054 tok/s +step 3700/19560 | loss 3.630125 (-0.44z)| norm 0.2946 (+0.16z)| lr 5.63e-04 | 8447.86 ms | -100.0% bf16 MFU | 62055 tok/s +step 3701/19560 | loss 3.657325 (+0.31z)| norm 0.2513 (-0.61z)| lr 5.63e-04 | 8447.27 ms | -100.0% bf16 MFU | 62055 tok/s +step 3702/19560 | loss 3.645303 (-0.03z)| norm 0.2611 (-0.42z)| lr 5.63e-04 | 8447.12 ms | -100.0% bf16 MFU | 62056 tok/s +step 3703/19560 | loss 3.700471 (+1.47z)| norm 0.2499 (-0.62z)| lr 5.63e-04 | 8449.64 ms | -100.0% bf16 MFU | 62055 tok/s +step 3704/19560 | loss 3.625374 (-0.59z)| norm 0.2528 (-0.56z)| lr 5.63e-04 | 8450.27 ms | -100.0% bf16 MFU | 62055 tok/s +step 3705/19560 | loss 3.621860 (-0.68z)| norm 0.2790 (-0.10z)| lr 5.63e-04 | 8448.50 ms | -100.0% bf16 MFU | 62055 tok/s +step 3706/19560 | loss 3.677708 (+0.84z)| norm 0.2756 (-0.16z)| lr 5.63e-04 | 8448.85 ms | -100.0% bf16 MFU | 62055 tok/s +step 3707/19560 | loss 3.657010 (+0.27z)| norm 0.3157 (+0.54z)| lr 5.63e-04 | 8450.03 ms | -100.0% bf16 MFU | 62054 tok/s +step 3708/19560 | loss 3.661640 (+0.39z)| norm 0.2862 (+0.01z)| lr 5.63e-04 | 8448.50 ms | -100.0% bf16 MFU | 62055 tok/s +step 3709/19560 | loss 3.654953 (+0.21z)| norm 0.3244 (+0.68z)| lr 5.63e-04 | 8446.37 ms | -100.0% bf16 MFU | 62055 tok/s +step 3710/19560 | loss 3.636655 (-0.28z)| norm 0.3331 (+0.83z)| lr 5.63e-04 | 8446.78 ms | -100.0% bf16 MFU | 62056 tok/s +step 3711/19560 | loss 3.617111 (-0.81z)| norm 0.3182 (+0.56z)| lr 5.63e-04 | 8447.20 ms | -100.0% bf16 MFU | 62057 tok/s +step 3712/19560 | loss 3.645037 (-0.03z)| norm 0.3101 (+0.41z)| lr 5.63e-04 | 8448.46 ms | -100.0% bf16 MFU | 62057 tok/s +step 3713/19560 | loss 3.659719 (+0.38z)| norm 0.3004 (+0.23z)| lr 5.63e-04 | 8451.28 ms | -100.0% bf16 MFU | 62056 tok/s +step 3714/19560 | loss 3.625097 (-0.58z)| norm 0.2972 (+0.17z)| lr 5.63e-04 | 8445.75 ms | -100.0% bf16 MFU | 62057 tok/s +step 3715/19560 | loss 3.662488 (+0.47z)| norm 0.3111 (+0.41z)| lr 5.63e-04 | 8448.66 ms | -100.0% bf16 MFU | 62057 tok/s +step 3716/19560 | loss 3.619226 (-0.76z)| norm 0.2938 (+0.10z)| lr 5.63e-04 | 8447.16 ms | -100.0% bf16 MFU | 62057 tok/s +step 3717/19560 | loss 3.607563 (-1.07z)| norm 0.2712 (-0.31z)| lr 5.63e-04 | 8449.72 ms | -100.0% bf16 MFU | 62057 tok/s +step 3718/19560 | loss 3.722863 (+2.15z)| norm 0.2943 (+0.10z)| lr 5.63e-04 | 8448.79 ms | -100.0% bf16 MFU | 62057 tok/s +step 3719/19560 | loss 3.662045 (+0.44z)| norm 0.3124 (+0.42z)| lr 5.63e-04 | 8448.77 ms | -100.0% bf16 MFU | 62057 tok/s +step 3720/19560 | loss 3.721768 (+2.07z)| norm 0.2799 (-0.16z)| lr 5.63e-04 | 8449.10 ms | -100.0% bf16 MFU | 62056 tok/s +step 3721/19560 | loss 3.609790 (-0.99z)| norm 0.2899 (+0.02z)| lr 5.63e-04 | 8446.29 ms | -100.0% bf16 MFU | 62057 tok/s +step 3722/19560 | loss 3.592817 (-1.43z)| norm 0.2930 (+0.08z)| lr 5.63e-04 | 8448.42 ms | -100.0% bf16 MFU | 62057 tok/s +step 3723/19560 | loss 3.539529 (-2.78z)| norm 0.2860 (-0.05z)| lr 5.63e-04 | 8445.83 ms | -100.0% bf16 MFU | 62058 tok/s +step 3724/19560 | loss 3.551937 (-2.38z)| norm 0.3028 (+0.24z)| lr 5.63e-04 | 8446.88 ms | -100.0% bf16 MFU | 62059 tok/s +step 3725/19560 | loss 3.620656 (-0.59z)| norm 0.3139 (+0.44z)| lr 5.63e-04 | 8447.29 ms | -100.0% bf16 MFU | 62059 tok/s +step 3726/19560 | loss 3.650087 (+0.18z)| norm 0.2764 (-0.23z)| lr 5.63e-04 | 8446.66 ms | -100.0% bf16 MFU | 62060 tok/s +step 3727/19560 | loss 3.667859 (+0.64z)| norm 0.2964 (+0.12z)| lr 5.63e-04 | 8446.73 ms | -100.0% bf16 MFU | 62060 tok/s +step 3728/19560 | loss 3.675541 (+0.83z)| norm 0.2795 (-0.17z)| lr 5.63e-04 | 8448.59 ms | -100.0% bf16 MFU | 62060 tok/s +step 3729/19560 | loss 3.634171 (-0.26z)| norm 0.2633 (-0.81z)| lr 5.63e-04 | 8457.67 ms | -100.0% bf16 MFU | 62056 tok/s +step 3730/19560 | loss 3.629629 (-0.37z)| norm 0.2648 (-0.74z)| lr 5.63e-04 | 8470.44 ms | -100.0% bf16 MFU | 62048 tok/s +step 3731/19560 | loss 3.574411 (-1.82z)| norm 0.2985 (+0.57z)| lr 5.63e-04 | 8471.69 ms | -100.0% bf16 MFU | 62040 tok/s +step 3732/19560 | loss 3.597146 (-1.20z)| norm 0.2531 (-1.18z)| lr 5.63e-04 | 8465.26 ms | -100.0% bf16 MFU | 62035 tok/s +step 3733/19560 | loss 3.673702 (+0.82z)| norm 0.2564 (-1.03z)| lr 5.63e-04 | 8467.74 ms | -100.0% bf16 MFU | 62029 tok/s +step 3734/19560 | loss 3.694241 (+1.34z)| norm 0.2644 (-0.71z)| lr 5.63e-04 | 8471.38 ms | -100.0% bf16 MFU | 62022 tok/s +step 3735/19560 | loss 3.689798 (+1.21z)| norm 0.2647 (-0.70z)| lr 5.62e-04 | 8468.19 ms | -100.0% bf16 MFU | 62017 tok/s +step 3736/19560 | loss 3.633572 (-0.28z)| norm 0.2699 (-0.47z)| lr 5.62e-04 | 8467.90 ms | -100.0% bf16 MFU | 62012 tok/s +step 3737/19560 | loss 3.668065 (+0.64z)| norm 0.2569 (-1.01z)| lr 5.62e-04 | 8469.62 ms | -100.0% bf16 MFU | 62006 tok/s +step 3738/19560 | loss 3.650096 (+0.16z)| norm 0.2745 (-0.26z)| lr 5.62e-04 | 8466.34 ms | -100.0% bf16 MFU | 62002 tok/s +step 3739/19560 | loss 3.627707 (-0.43z)| norm 0.3019 (+0.90z)| lr 5.62e-04 | 8467.43 ms | -100.0% bf16 MFU | 61998 tok/s +step 3740/19560 | loss 3.707065 (+1.68z)| norm 0.2924 (+0.48z)| lr 5.62e-04 | 8468.17 ms | -100.0% bf16 MFU | 61994 tok/s +step 3741/19560 | loss 3.638309 (-0.15z)| norm 0.2672 (-0.59z)| lr 5.62e-04 | 8469.83 ms | -100.0% bf16 MFU | 61989 tok/s +step 3742/19560 | loss 3.597106 (-1.23z)| norm 0.2510 (-1.26z)| lr 5.62e-04 | 8469.70 ms | -100.0% bf16 MFU | 61985 tok/s +step 3743/19560 | loss 3.600912 (-1.13z)| norm 0.2524 (-1.18z)| lr 5.62e-04 | 8471.84 ms | -100.0% bf16 MFU | 61980 tok/s +step 3744/19560 | loss 3.650195 (+0.18z)| norm 0.2558 (-1.02z)| lr 5.62e-04 | 8465.93 ms | -100.0% bf16 MFU | 61977 tok/s +step 3745/19560 | loss 3.638563 (-0.13z)| norm 0.2824 (+0.10z)| lr 5.62e-04 | 8466.29 ms | -100.0% bf16 MFU | 61975 tok/s +step 3746/19560 | loss 3.597685 (-1.20z)| norm 0.4349 (+5.61z)| lr 5.62e-04 | 8458.83 ms | -100.0% bf16 MFU | 61975 tok/s +step 3747/19560 | loss 3.607128 (-0.95z)| norm 0.3022 (+0.76z)| lr 5.62e-04 | 8451.80 ms | -100.0% bf16 MFU | 61978 tok/s +step 3748/19560 | loss 3.692078 (+1.28z)| norm 0.2956 (+0.51z)| lr 5.62e-04 | 8453.64 ms | -100.0% bf16 MFU | 61980 tok/s +step 3749/19560 | loss 3.664502 (+0.56z)| norm 0.3117 (+1.08z)| lr 5.62e-04 | 8453.92 ms | -100.0% bf16 MFU | 61982 tok/s +step 3750/19560 | loss 3.652427 (+0.23z)| norm 0.2813 (-0.04z)| lr 5.62e-04 | 8455.22 ms | -100.0% bf16 MFU | 61983 tok/s +val loss 3.625043 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2697/10042 = 0.268572 +step 3751/19560 | loss 3.647960 (+0.12z)| norm 0.2622 (-0.75z)| lr 5.62e-04 | 8451.96 ms | -100.0% bf16 MFU | 61985 tok/s +step 3752/19560 | loss 3.666925 (+0.62z)| norm 0.2753 (-0.27z)| lr 5.62e-04 | 8459.79 ms | -100.0% bf16 MFU | 61985 tok/s +step 3753/19560 | loss 3.683352 (+1.05z)| norm 0.2755 (-0.27z)| lr 5.62e-04 | 8464.18 ms | -100.0% bf16 MFU | 61983 tok/s +step 3754/19560 | loss 3.589402 (-1.40z)| norm 0.3690 (+3.08z)| lr 5.62e-04 | 8461.00 ms | -100.0% bf16 MFU | 61982 tok/s +step 3755/19560 | loss 3.640948 (-0.06z)| norm 0.3775 (+3.23z)| lr 5.62e-04 | 8457.56 ms | -100.0% bf16 MFU | 61982 tok/s +step 3756/19560 | loss 3.675445 (+0.84z)| norm 0.3799 (+3.16z)| lr 5.62e-04 | 8460.13 ms | -100.0% bf16 MFU | 61982 tok/s +step 3757/19560 | loss 3.614406 (-0.75z)| norm 0.3222 (+1.21z)| lr 5.62e-04 | 8460.13 ms | -100.0% bf16 MFU | 61981 tok/s +step 3758/19560 | loss 3.687485 (+1.15z)| norm 0.3405 (+1.78z)| lr 5.62e-04 | 8458.78 ms | -100.0% bf16 MFU | 61981 tok/s +step 3759/19560 | loss 3.676153 (+0.84z)| norm 0.3044 (+0.57z)| lr 5.62e-04 | 8458.21 ms | -100.0% bf16 MFU | 61982 tok/s +step 3760/19560 | loss 3.664581 (+0.53z)| norm 0.2846 (-0.10z)| lr 5.62e-04 | 8463.51 ms | -100.0% bf16 MFU | 61980 tok/s +step 3761/19560 | loss 3.652856 (+0.23z)| norm 0.2888 (+0.04z)| lr 5.62e-04 | 8464.55 ms | -100.0% bf16 MFU | 61978 tok/s +step 3762/19560 | loss 3.695024 (+1.32z)| norm 0.2874 (-0.02z)| lr 5.62e-04 | 8459.99 ms | -100.0% bf16 MFU | 61977 tok/s +step 3763/19560 | loss 3.657313 (+0.33z)| norm 0.3324 (+1.47z)| lr 5.62e-04 | 8464.65 ms | -100.0% bf16 MFU | 61976 tok/s +step 3764/19560 | loss 3.638818 (-0.16z)| norm 0.3159 (+0.90z)| lr 5.62e-04 | 8466.01 ms | -100.0% bf16 MFU | 61973 tok/s +step 3765/19560 | loss 3.732570 (+2.26z)| norm 0.8264 (+9.53z)| lr 5.62e-04 | 8465.79 ms | -100.0% bf16 MFU | 61971 tok/s +step 3766/19560 | loss 3.608745 (-0.93z)| norm 0.4122 (+2.07z)| lr 5.62e-04 | 8467.21 ms | -100.0% bf16 MFU | 61968 tok/s +step 3767/19560 | loss 3.839411 (+4.55z)| norm 0.4374 (+2.44z)| lr 5.62e-04 | 8456.48 ms | -100.0% bf16 MFU | 61970 tok/s +step 3768/19560 | loss 3.687022 (+0.95z)| norm 0.5208 (+3.63z)| lr 5.62e-04 | 8449.31 ms | -100.0% bf16 MFU | 61974 tok/s +step 3769/19560 | loss 3.716079 (+1.60z)| norm 0.4065 (+1.74z)| lr 5.62e-04 | 8450.80 ms | -100.0% bf16 MFU | 61977 tok/s +step 3770/19560 | loss 3.623897 (-0.54z)| norm 0.3347 (+0.58z)| lr 5.62e-04 | 8443.03 ms | -100.0% bf16 MFU | 61983 tok/s +step 3771/19560 | loss 3.642225 (-0.12z)| norm 0.3179 (+0.30z)| lr 5.62e-04 | 8444.87 ms | -100.0% bf16 MFU | 61988 tok/s +step 3772/19560 | loss 3.684640 (+0.86z)| norm 0.3116 (+0.20z)| lr 5.62e-04 | 8448.31 ms | -100.0% bf16 MFU | 61992 tok/s +step 3773/19560 | loss 3.582507 (-1.50z)| norm 0.2968 (-0.04z)| lr 5.62e-04 | 8447.20 ms | -100.0% bf16 MFU | 61996 tok/s +step 3774/19560 | loss 3.618837 (-0.65z)| norm 0.3067 (+0.11z)| lr 5.62e-04 | 8448.32 ms | -100.0% bf16 MFU | 61999 tok/s +step 3775/19560 | loss 3.633745 (-0.33z)| norm 0.2753 (-0.40z)| lr 5.62e-04 | 8447.69 ms | -100.0% bf16 MFU | 62002 tok/s +step 3776/19560 | loss 3.673198 (+0.60z)| norm 0.2643 (-0.58z)| lr 5.61e-04 | 8443.98 ms | -100.0% bf16 MFU | 62006 tok/s +step 3777/19560 | loss 3.669905 (+0.52z)| norm 0.2846 (-0.25z)| lr 5.61e-04 | 8441.17 ms | -100.0% bf16 MFU | 62012 tok/s +step 3778/19560 | loss 3.659376 (+0.26z)| norm 0.2648 (-0.56z)| lr 5.61e-04 | 8441.92 ms | -100.0% bf16 MFU | 62016 tok/s +step 3779/19560 | loss 3.739123 (+2.10z)| norm 0.2543 (-0.72z)| lr 5.61e-04 | 8439.89 ms | -100.0% bf16 MFU | 62021 tok/s +step 3780/19560 | loss 3.689959 (+0.95z)| norm 0.2706 (-0.46z)| lr 5.61e-04 | 8443.30 ms | -100.0% bf16 MFU | 62025 tok/s +step 3781/19560 | loss 3.595242 (-1.23z)| norm 0.2418 (-0.91z)| lr 5.61e-04 | 8450.64 ms | -100.0% bf16 MFU | 62026 tok/s +step 3782/19560 | loss 3.709955 (+1.38z)| norm 0.2732 (-0.40z)| lr 5.61e-04 | 8437.01 ms | -100.0% bf16 MFU | 62032 tok/s +step 3783/19560 | loss 3.634833 (-0.33z)| norm 0.2979 (+0.00z)| lr 5.61e-04 | 8446.59 ms | -100.0% bf16 MFU | 62034 tok/s +step 3784/19560 | loss 3.643203 (-0.14z)| norm 0.2615 (-0.58z)| lr 5.61e-04 | 8441.46 ms | -100.0% bf16 MFU | 62037 tok/s +step 3785/19560 | loss 3.608147 (-0.94z)| norm 0.2538 (-0.70z)| lr 5.61e-04 | 8445.03 ms | -100.0% bf16 MFU | 62040 tok/s +step 3786/19560 | loss 3.616138 (-0.75z)| norm 0.2595 (-0.60z)| lr 5.61e-04 | 8447.90 ms | -100.0% bf16 MFU | 62041 tok/s +step 3787/19560 | loss 3.648057 (-0.04z)| norm 0.2549 (-0.67z)| lr 5.61e-04 | 8446.38 ms | -100.0% bf16 MFU | 62042 tok/s +step 3788/19560 | loss 3.648857 (-0.02z)| norm 0.2664 (-0.49z)| lr 5.61e-04 | 8449.17 ms | -100.0% bf16 MFU | 62043 tok/s +step 3789/19560 | loss 3.596513 (-1.23z)| norm 0.2652 (-0.50z)| lr 5.61e-04 | 8445.99 ms | -100.0% bf16 MFU | 62044 tok/s +step 3790/19560 | loss 3.606753 (-0.98z)| norm 0.2558 (-0.65z)| lr 5.61e-04 | 8450.51 ms | -100.0% bf16 MFU | 62044 tok/s +step 3791/19560 | loss 3.577801 (-1.61z)| norm 0.2644 (-0.51z)| lr 5.61e-04 | 8452.58 ms | -100.0% bf16 MFU | 62043 tok/s +step 3792/19560 | loss 3.632481 (-0.36z)| norm 0.2880 (-0.13z)| lr 5.61e-04 | 8445.16 ms | -100.0% bf16 MFU | 62045 tok/s +step 3793/19560 | loss 3.693316 (+1.04z)| norm 0.2767 (-0.31z)| lr 5.61e-04 | 8451.50 ms | -100.0% bf16 MFU | 62045 tok/s +step 3794/19560 | loss 3.664730 (+0.40z)| norm 0.2833 (-0.19z)| lr 5.61e-04 | 8450.54 ms | -100.0% bf16 MFU | 62045 tok/s +step 3795/19560 | loss 3.621760 (-0.62z)| norm 0.2553 (-0.63z)| lr 5.61e-04 | 8450.16 ms | -100.0% bf16 MFU | 62045 tok/s +step 3796/19560 | loss 3.584470 (-1.47z)| norm 0.2764 (-0.29z)| lr 5.61e-04 | 8442.04 ms | -100.0% bf16 MFU | 62048 tok/s +step 3797/19560 | loss 3.662942 (+0.36z)| norm 0.3103 (+0.25z)| lr 5.61e-04 | 8449.71 ms | -100.0% bf16 MFU | 62048 tok/s +step 3798/19560 | loss 3.615183 (-0.76z)| norm 0.2875 (-0.11z)| lr 5.61e-04 | 8449.53 ms | -100.0% bf16 MFU | 62048 tok/s +step 3799/19560 | loss 3.681508 (+0.79z)| norm 0.3229 (+0.45z)| lr 5.61e-04 | 8449.44 ms | -100.0% bf16 MFU | 62048 tok/s +step 3800/19560 | loss 3.603113 (-1.06z)| norm 0.3208 (+0.41z)| lr 5.61e-04 | 8446.21 ms | -100.0% bf16 MFU | 62049 tok/s +step 3801/19560 | loss 3.649830 (+0.04z)| norm 0.2516 (-0.69z)| lr 5.61e-04 | 8449.66 ms | -100.0% bf16 MFU | 62049 tok/s +step 3802/19560 | loss 3.734647 (+2.02z)| norm 0.2781 (-0.26z)| lr 5.61e-04 | 8450.48 ms | -100.0% bf16 MFU | 62049 tok/s +step 3803/19560 | loss 3.657206 (+0.20z)| norm 0.2823 (-0.20z)| lr 5.61e-04 | 8452.17 ms | -100.0% bf16 MFU | 62048 tok/s +step 3804/19560 | loss 3.582114 (-1.53z)| norm 0.2517 (-0.68z)| lr 5.61e-04 | 8456.92 ms | -100.0% bf16 MFU | 62045 tok/s +step 3805/19560 | loss 3.693876 (+1.06z)| norm 0.2737 (-0.33z)| lr 5.61e-04 | 8453.00 ms | -100.0% bf16 MFU | 62044 tok/s +step 3806/19560 | loss 3.676733 (+0.66z)| norm 0.2497 (-0.71z)| lr 5.61e-04 | 8454.23 ms | -100.0% bf16 MFU | 62043 tok/s +step 3807/19560 | loss 3.648563 (+0.00z)| norm 0.2702 (-0.38z)| lr 5.61e-04 | 8447.70 ms | -100.0% bf16 MFU | 62044 tok/s +step 3808/19560 | loss 3.647507 (-0.02z)| norm 0.2671 (-0.43z)| lr 5.61e-04 | 8458.83 ms | -100.0% bf16 MFU | 62041 tok/s +step 3809/19560 | loss 3.695751 (+1.11z)| norm 0.2770 (-0.27z)| lr 5.61e-04 | 8453.83 ms | -100.0% bf16 MFU | 62039 tok/s +step 3810/19560 | loss 3.648752 (+0.01z)| norm 0.2760 (-0.29z)| lr 5.61e-04 | 8455.51 ms | -100.0% bf16 MFU | 62038 tok/s +step 3811/19560 | loss 3.622742 (-0.59z)| norm 0.2744 (-0.31z)| lr 5.61e-04 | 8462.06 ms | -100.0% bf16 MFU | 62034 tok/s +step 3812/19560 | loss 3.611880 (-0.83z)| norm 0.2524 (-0.66z)| lr 5.61e-04 | 8455.01 ms | -100.0% bf16 MFU | 62032 tok/s +step 3813/19560 | loss 3.607806 (-0.96z)| norm 0.2647 (-0.47z)| lr 5.61e-04 | 8453.67 ms | -100.0% bf16 MFU | 62032 tok/s +step 3814/19560 | loss 3.626112 (-0.51z)| norm 0.3102 (+0.25z)| lr 5.61e-04 | 8448.77 ms | -100.0% bf16 MFU | 62033 tok/s +step 3815/19560 | loss 3.649798 (+0.05z)| norm 0.2460 (-0.77z)| lr 5.61e-04 | 8455.62 ms | -100.0% bf16 MFU | 62032 tok/s +step 3816/19560 | loss 3.631783 (-0.38z)| norm 0.2548 (-0.63z)| lr 5.61e-04 | 8449.88 ms | -100.0% bf16 MFU | 62032 tok/s +step 3817/19560 | loss 3.631807 (-0.37z)| norm 0.2506 (-0.69z)| lr 5.60e-04 | 8459.12 ms | -100.0% bf16 MFU | 62030 tok/s +step 3818/19560 | loss 3.693121 (+1.07z)| norm 0.2420 (-0.82z)| lr 5.60e-04 | 8448.34 ms | -100.0% bf16 MFU | 62031 tok/s +step 3819/19560 | loss 3.653435 (+0.13z)| norm 0.2633 (-0.49z)| lr 5.60e-04 | 8455.76 ms | -100.0% bf16 MFU | 62030 tok/s +step 3820/19560 | loss 3.643208 (-0.12z)| norm 0.2869 (-0.12z)| lr 5.60e-04 | 8450.67 ms | -100.0% bf16 MFU | 62030 tok/s +step 3821/19560 | loss 3.623003 (-0.61z)| norm 0.3002 (+0.09z)| lr 5.60e-04 | 8453.24 ms | -100.0% bf16 MFU | 62030 tok/s +step 3822/19560 | loss 3.619993 (-0.67z)| norm 0.2797 (-0.24z)| lr 5.60e-04 | 8452.14 ms | -100.0% bf16 MFU | 62030 tok/s +step 3823/19560 | loss 3.668422 (+0.49z)| norm 0.3075 (+0.20z)| lr 5.60e-04 | 8453.71 ms | -100.0% bf16 MFU | 62029 tok/s +step 3824/19560 | loss 3.640468 (-0.18z)| norm 0.2916 (-0.05z)| lr 5.60e-04 | 8454.64 ms | -100.0% bf16 MFU | 62028 tok/s +step 3825/19560 | loss 3.672877 (+0.59z)| norm 0.2621 (-0.52z)| lr 5.60e-04 | 8451.58 ms | -100.0% bf16 MFU | 62029 tok/s +step 3826/19560 | loss 3.619857 (-0.67z)| norm 0.2974 (+0.04z)| lr 5.60e-04 | 8451.08 ms | -100.0% bf16 MFU | 62029 tok/s +step 3827/19560 | loss 3.677083 (+0.71z)| norm 0.2849 (-0.15z)| lr 5.60e-04 | 8448.22 ms | -100.0% bf16 MFU | 62031 tok/s +step 3828/19560 | loss 3.617498 (-0.73z)| norm 0.2845 (-0.16z)| lr 5.60e-04 | 8456.92 ms | -100.0% bf16 MFU | 62029 tok/s +step 3829/19560 | loss 3.697675 (+1.20z)| norm 0.2871 (-0.12z)| lr 5.60e-04 | 8453.65 ms | -100.0% bf16 MFU | 62028 tok/s +step 3830/19560 | loss 3.594141 (-1.28z)| norm 0.2930 (-0.03z)| lr 5.60e-04 | 8452.99 ms | -100.0% bf16 MFU | 62028 tok/s +step 3831/19560 | loss 3.620440 (-0.63z)| norm 0.2823 (-0.21z)| lr 5.60e-04 | 8452.29 ms | -100.0% bf16 MFU | 62028 tok/s +step 3832/19560 | loss 3.595873 (-1.21z)| norm 0.2567 (-0.62z)| lr 5.60e-04 | 8448.19 ms | -100.0% bf16 MFU | 62030 tok/s +step 3833/19560 | loss 3.604067 (-1.01z)| norm 0.2644 (-0.49z)| lr 5.60e-04 | 8450.82 ms | -100.0% bf16 MFU | 62030 tok/s +step 3834/19560 | loss 3.660306 (+0.33z)| norm 0.3142 (+0.30z)| lr 5.60e-04 | 8449.04 ms | -100.0% bf16 MFU | 62031 tok/s +step 3835/19560 | loss 3.638006 (-0.20z)| norm 0.3543 (+0.93z)| lr 5.60e-04 | 8451.33 ms | -100.0% bf16 MFU | 62032 tok/s +step 3836/19560 | loss 3.583818 (-1.46z)| norm 0.3152 (+0.31z)| lr 5.60e-04 | 8453.42 ms | -100.0% bf16 MFU | 62031 tok/s +step 3837/19560 | loss 3.639204 (-0.15z)| norm 0.2731 (-0.36z)| lr 5.60e-04 | 8450.84 ms | -100.0% bf16 MFU | 62032 tok/s +step 3838/19560 | loss 3.663804 (+0.43z)| norm 0.3113 (+0.25z)| lr 5.60e-04 | 8450.93 ms | -100.0% bf16 MFU | 62032 tok/s +step 3839/19560 | loss 3.624307 (-0.51z)| norm 0.2910 (-0.07z)| lr 5.60e-04 | 8447.95 ms | -100.0% bf16 MFU | 62033 tok/s +step 3840/19560 | loss 3.617151 (-0.67z)| norm 0.3018 (+0.10z)| lr 5.60e-04 | 8457.90 ms | -100.0% bf16 MFU | 62031 tok/s +step 3841/19560 | loss 3.670814 (+0.60z)| norm 0.3171 (+0.35z)| lr 5.60e-04 | 8449.30 ms | -100.0% bf16 MFU | 62032 tok/s +step 3842/19560 | loss 3.651040 (+0.12z)| norm 0.2689 (-0.42z)| lr 5.60e-04 | 8445.22 ms | -100.0% bf16 MFU | 62035 tok/s +step 3843/19560 | loss 3.610046 (-0.84z)| norm 0.2995 (+0.07z)| lr 5.60e-04 | 8450.78 ms | -100.0% bf16 MFU | 62035 tok/s +step 3844/19560 | loss 3.653731 (+0.19z)| norm 0.3884 (+1.47z)| lr 5.60e-04 | 8449.22 ms | -100.0% bf16 MFU | 62036 tok/s +step 3845/19560 | loss 3.588243 (-1.35z)| norm 0.3244 (+0.45z)| lr 5.60e-04 | 8447.79 ms | -100.0% bf16 MFU | 62037 tok/s +step 3846/19560 | loss 3.617232 (-0.65z)| norm 0.2955 (-0.01z)| lr 5.60e-04 | 8450.92 ms | -100.0% bf16 MFU | 62037 tok/s +step 3847/19560 | loss 3.615651 (-0.68z)| norm 0.3055 (+0.15z)| lr 5.60e-04 | 8453.87 ms | -100.0% bf16 MFU | 62036 tok/s +step 3848/19560 | loss 3.683658 (+0.95z)| norm 0.2657 (-0.48z)| lr 5.60e-04 | 8449.99 ms | -100.0% bf16 MFU | 62037 tok/s +step 3849/19560 | loss 3.636202 (-0.19z)| norm 0.4838 (+2.85z)| lr 5.60e-04 | 8449.16 ms | -100.0% bf16 MFU | 62037 tok/s +step 3850/19560 | loss 3.631254 (-0.32z)| norm 0.2778 (-0.30z)| lr 5.60e-04 | 8448.86 ms | -100.0% bf16 MFU | 62038 tok/s +step 3851/19560 | loss 3.627244 (-0.45z)| norm 0.3038 (+0.10z)| lr 5.60e-04 | 8450.09 ms | -100.0% bf16 MFU | 62039 tok/s +step 3852/19560 | loss 3.638216 (-0.20z)| norm 0.2676 (-0.45z)| lr 5.60e-04 | 8448.47 ms | -100.0% bf16 MFU | 62040 tok/s +step 3853/19560 | loss 3.607832 (-0.96z)| norm 0.2714 (-0.39z)| lr 5.60e-04 | 8446.16 ms | -100.0% bf16 MFU | 62041 tok/s +step 3854/19560 | loss 3.674796 (+0.73z)| norm 0.2704 (-0.41z)| lr 5.60e-04 | 8445.13 ms | -100.0% bf16 MFU | 62043 tok/s +step 3855/19560 | loss 3.582764 (-1.57z)| norm 0.2777 (-0.29z)| lr 5.60e-04 | 8447.13 ms | -100.0% bf16 MFU | 62044 tok/s +step 3856/19560 | loss 3.616076 (-0.72z)| norm 0.2942 (-0.04z)| lr 5.60e-04 | 8449.70 ms | -100.0% bf16 MFU | 62045 tok/s +step 3857/19560 | loss 3.619380 (-0.64z)| norm 0.2757 (-0.33z)| lr 5.59e-04 | 8449.91 ms | -100.0% bf16 MFU | 62045 tok/s +step 3858/19560 | loss 3.620016 (-0.62z)| norm 0.2855 (-0.18z)| lr 5.59e-04 | 8449.36 ms | -100.0% bf16 MFU | 62045 tok/s +step 3859/19560 | loss 3.635338 (-0.25z)| norm 0.2711 (-0.39z)| lr 5.59e-04 | 8451.30 ms | -100.0% bf16 MFU | 62045 tok/s +step 3860/19560 | loss 3.661567 (+0.40z)| norm 0.2733 (-0.36z)| lr 5.59e-04 | 8447.39 ms | -100.0% bf16 MFU | 62046 tok/s +step 3861/19560 | loss 3.624735 (-0.52z)| norm 0.2737 (-0.36z)| lr 5.59e-04 | 8449.48 ms | -100.0% bf16 MFU | 62046 tok/s +step 3862/19560 | loss 3.660705 (+0.40z)| norm 0.2937 (-0.06z)| lr 5.59e-04 | 8450.47 ms | -100.0% bf16 MFU | 62046 tok/s +step 3863/19560 | loss 3.682320 (+0.96z)| norm 0.3026 (+0.07z)| lr 5.59e-04 | 8448.92 ms | -100.0% bf16 MFU | 62046 tok/s +step 3864/19560 | loss 3.644267 (-0.02z)| norm 0.2912 (-0.10z)| lr 5.59e-04 | 8448.87 ms | -100.0% bf16 MFU | 62046 tok/s +step 3865/19560 | loss 3.677364 (+0.83z)| norm 0.2712 (-0.41z)| lr 5.59e-04 | 8447.67 ms | -100.0% bf16 MFU | 62047 tok/s +step 3866/19560 | loss 3.615367 (-0.75z)| norm 0.2770 (-0.32z)| lr 5.59e-04 | 8447.75 ms | -100.0% bf16 MFU | 62048 tok/s +step 3867/19560 | loss 3.719815 (+1.87z)| norm 0.3253 (+0.42z)| lr 5.59e-04 | 8448.87 ms | -100.0% bf16 MFU | 62048 tok/s +step 3868/19560 | loss 3.652093 (+0.18z)| norm 0.2783 (-0.30z)| lr 5.59e-04 | 8448.83 ms | -100.0% bf16 MFU | 62049 tok/s +step 3869/19560 | loss 3.630584 (-0.37z)| norm 0.2481 (-0.77z)| lr 5.59e-04 | 8448.08 ms | -100.0% bf16 MFU | 62049 tok/s +step 3870/19560 | loss 3.633581 (-0.30z)| norm 0.2484 (-0.76z)| lr 5.59e-04 | 8450.11 ms | -100.0% bf16 MFU | 62049 tok/s +step 3871/19560 | loss 3.691627 (+1.17z)| norm 0.2614 (-0.56z)| lr 5.59e-04 | 8446.87 ms | -100.0% bf16 MFU | 62050 tok/s +step 3872/19560 | loss 3.659870 (+0.35z)| norm 0.2636 (-0.53z)| lr 5.59e-04 | 8448.65 ms | -100.0% bf16 MFU | 62050 tok/s +step 3873/19560 | loss 3.527312 (-2.92z)| norm 0.2986 (+0.01z)| lr 5.59e-04 | 8449.28 ms | -100.0% bf16 MFU | 62050 tok/s +step 3874/19560 | loss 3.597149 (-1.19z)| norm 0.3338 (+0.57z)| lr 5.59e-04 | 8451.74 ms | -100.0% bf16 MFU | 62049 tok/s +step 3875/19560 | loss 3.659831 (+0.35z)| norm 0.3233 (+0.40z)| lr 5.59e-04 | 8447.84 ms | -100.0% bf16 MFU | 62050 tok/s +step 3876/19560 | loss 3.681006 (+0.88z)| norm 0.2729 (-0.38z)| lr 5.59e-04 | 8447.79 ms | -100.0% bf16 MFU | 62051 tok/s +step 3877/19560 | loss 3.640595 (-0.12z)| norm 0.3067 (+0.15z)| lr 5.59e-04 | 8449.16 ms | -100.0% bf16 MFU | 62051 tok/s +step 3878/19560 | loss 3.606938 (-0.95z)| norm 0.2737 (-0.37z)| lr 5.59e-04 | 8453.22 ms | -100.0% bf16 MFU | 62049 tok/s +step 3879/19560 | loss 3.582072 (-1.53z)| norm 0.2629 (-0.54z)| lr 5.59e-04 | 8447.29 ms | -100.0% bf16 MFU | 62050 tok/s +step 3880/19560 | loss 3.631764 (-0.31z)| norm 0.2815 (-0.25z)| lr 5.59e-04 | 8447.75 ms | -100.0% bf16 MFU | 62051 tok/s +step 3881/19560 | loss 3.625210 (-0.46z)| norm 0.2801 (-0.27z)| lr 5.59e-04 | 8449.95 ms | -100.0% bf16 MFU | 62051 tok/s +step 3882/19560 | loss 3.617603 (-0.66z)| norm 0.2831 (-0.21z)| lr 5.59e-04 | 8444.74 ms | -100.0% bf16 MFU | 62052 tok/s +step 3883/19560 | loss 3.591914 (-1.27z)| norm 0.2590 (-0.58z)| lr 5.59e-04 | 8447.32 ms | -100.0% bf16 MFU | 62053 tok/s +step 3884/19560 | loss 3.605260 (-0.93z)| norm 0.2914 (-0.06z)| lr 5.59e-04 | 8448.05 ms | -100.0% bf16 MFU | 62053 tok/s +step 3885/19560 | loss 3.692596 (+1.20z)| norm 0.2929 (-0.03z)| lr 5.59e-04 | 8444.23 ms | -100.0% bf16 MFU | 62055 tok/s +step 3886/19560 | loss 3.622805 (-0.50z)| norm 0.3657 (+1.12z)| lr 5.59e-04 | 8447.79 ms | -100.0% bf16 MFU | 62055 tok/s +step 3887/19560 | loss 3.597205 (-1.11z)| norm 0.3312 (+0.57z)| lr 5.59e-04 | 8445.59 ms | -100.0% bf16 MFU | 62057 tok/s +step 3888/19560 | loss 3.648445 (+0.15z)| norm 0.2944 (-0.01z)| lr 5.59e-04 | 8448.34 ms | -100.0% bf16 MFU | 62057 tok/s +step 3889/19560 | loss 3.663511 (+0.51z)| norm 0.2791 (-0.26z)| lr 5.59e-04 | 8447.61 ms | -100.0% bf16 MFU | 62057 tok/s +step 3890/19560 | loss 3.606728 (-0.87z)| norm 0.2849 (-0.16z)| lr 5.59e-04 | 8451.95 ms | -100.0% bf16 MFU | 62056 tok/s +step 3891/19560 | loss 3.595286 (-1.13z)| norm 0.2722 (-0.36z)| lr 5.59e-04 | 8450.44 ms | -100.0% bf16 MFU | 62055 tok/s +step 3892/19560 | loss 3.695533 (+1.31z)| norm 0.2797 (-0.23z)| lr 5.59e-04 | 8447.90 ms | -100.0% bf16 MFU | 62055 tok/s +step 3893/19560 | loss 3.643181 (+0.05z)| norm 0.2711 (-0.45z)| lr 5.59e-04 | 8449.03 ms | -100.0% bf16 MFU | 62055 tok/s +step 3894/19560 | loss 3.630910 (-0.26z)| norm 0.2972 (+0.20z)| lr 5.59e-04 | 8444.52 ms | -100.0% bf16 MFU | 62057 tok/s +step 3895/19560 | loss 3.640924 (+0.03z)| norm 0.3833 (+2.41z)| lr 5.59e-04 | 8447.27 ms | -100.0% bf16 MFU | 62057 tok/s +step 3896/19560 | loss 3.658709 (+0.53z)| norm 0.2801 (-0.20z)| lr 5.59e-04 | 8447.00 ms | -100.0% bf16 MFU | 62058 tok/s +step 3897/19560 | loss 3.652948 (+0.39z)| norm 0.2853 (-0.02z)| lr 5.58e-04 | 8450.01 ms | -100.0% bf16 MFU | 62057 tok/s +step 3898/19560 | loss 3.648962 (+0.28z)| norm 0.2543 (-0.98z)| lr 5.58e-04 | 8446.37 ms | -100.0% bf16 MFU | 62058 tok/s +step 3899/19560 | loss 3.623759 (-0.43z)| norm 0.2628 (-0.70z)| lr 5.58e-04 | 8448.76 ms | -100.0% bf16 MFU | 62058 tok/s +step 3900/19560 | loss 3.631027 (-0.22z)| norm 0.2565 (-0.89z)| lr 5.58e-04 | 8445.49 ms | -100.0% bf16 MFU | 62059 tok/s +step 3901/19560 | loss 3.642394 (+0.09z)| norm 0.2287 (-1.74z)| lr 5.58e-04 | 8447.08 ms | -100.0% bf16 MFU | 62059 tok/s +step 3902/19560 | loss 3.636003 (-0.09z)| norm 0.2923 (+0.27z)| lr 5.58e-04 | 8448.91 ms | -100.0% bf16 MFU | 62059 tok/s +step 3903/19560 | loss 3.659288 (+0.57z)| norm 0.2798 (-0.13z)| lr 5.58e-04 | 8446.72 ms | -100.0% bf16 MFU | 62060 tok/s +step 3904/19560 | loss 3.642120 (+0.08z)| norm 0.2790 (-0.16z)| lr 5.58e-04 | 8447.85 ms | -100.0% bf16 MFU | 62060 tok/s +step 3905/19560 | loss 3.685171 (+1.32z)| norm 0.2946 (+0.33z)| lr 5.58e-04 | 8445.30 ms | -100.0% bf16 MFU | 62061 tok/s +step 3906/19560 | loss 3.601618 (-1.07z)| norm 0.3199 (+1.11z)| lr 5.58e-04 | 8438.95 ms | -100.0% bf16 MFU | 62064 tok/s +step 3907/19560 | loss 3.583348 (-1.60z)| norm 0.3106 (+0.81z)| lr 5.58e-04 | 8440.18 ms | -100.0% bf16 MFU | 62067 tok/s +step 3908/19560 | loss 3.664616 (+0.80z)| norm 0.2732 (-0.37z)| lr 5.58e-04 | 8439.19 ms | -100.0% bf16 MFU | 62070 tok/s +step 3909/19560 | loss 3.658705 (+0.62z)| norm 0.2800 (-0.17z)| lr 5.58e-04 | 8438.46 ms | -100.0% bf16 MFU | 62073 tok/s +step 3910/19560 | loss 3.647757 (+0.31z)| norm 0.2752 (-0.32z)| lr 5.58e-04 | 8435.31 ms | -100.0% bf16 MFU | 62077 tok/s +step 3911/19560 | loss 3.588175 (-1.47z)| norm 0.3104 (+0.79z)| lr 5.58e-04 | 8437.56 ms | -100.0% bf16 MFU | 62080 tok/s +step 3912/19560 | loss 3.640017 (+0.09z)| norm 0.2755 (-0.32z)| lr 5.58e-04 | 8438.58 ms | -100.0% bf16 MFU | 62082 tok/s +step 3913/19560 | loss 3.609388 (-0.83z)| norm 0.2787 (-0.22z)| lr 5.58e-04 | 8437.99 ms | -100.0% bf16 MFU | 62085 tok/s +step 3914/19560 | loss 3.637731 (+0.01z)| norm 0.3094 (+0.74z)| lr 5.58e-04 | 8439.41 ms | -100.0% bf16 MFU | 62087 tok/s +step 3915/19560 | loss 3.587061 (-1.48z)| norm 0.2970 (+0.34z)| lr 5.58e-04 | 8434.65 ms | -100.0% bf16 MFU | 62090 tok/s +step 3916/19560 | loss 3.754351 (+3.33z)| norm 0.2776 (-0.28z)| lr 5.58e-04 | 8437.98 ms | -100.0% bf16 MFU | 62093 tok/s +step 3917/19560 | loss 3.647402 (+0.27z)| norm 0.2780 (-0.28z)| lr 5.58e-04 | 8437.43 ms | -100.0% bf16 MFU | 62095 tok/s +step 3918/19560 | loss 3.611131 (-0.77z)| norm 0.2696 (-0.55z)| lr 5.58e-04 | 8438.88 ms | -100.0% bf16 MFU | 62097 tok/s +step 3919/19560 | loss 3.579004 (-1.70z)| norm 0.2750 (-0.38z)| lr 5.58e-04 | 8436.88 ms | -100.0% bf16 MFU | 62099 tok/s +step 3920/19560 | loss 3.608742 (-0.83z)| norm 0.2676 (-0.61z)| lr 5.58e-04 | 8457.72 ms | -100.0% bf16 MFU | 62093 tok/s +step 3921/19560 | loss 3.604641 (-0.94z)| norm 0.3159 (+0.93z)| lr 5.58e-04 | 8465.40 ms | -100.0% bf16 MFU | 62085 tok/s +step 3922/19560 | loss 3.641541 (+0.13z)| norm 0.2824 (-0.15z)| lr 5.58e-04 | 8466.07 ms | -100.0% bf16 MFU | 62077 tok/s +step 3923/19560 | loss 3.651284 (+0.41z)| norm 0.2694 (-0.57z)| lr 5.58e-04 | 8463.05 ms | -100.0% bf16 MFU | 62071 tok/s +step 3924/19560 | loss 3.583718 (-1.55z)| norm 0.2677 (-0.62z)| lr 5.58e-04 | 8465.11 ms | -100.0% bf16 MFU | 62064 tok/s +step 3925/19560 | loss 3.626736 (-0.30z)| norm 0.2790 (-0.25z)| lr 5.58e-04 | 8458.47 ms | -100.0% bf16 MFU | 62060 tok/s +step 3926/19560 | loss 3.824843 (+4.87z)| norm 0.2803 (-0.21z)| lr 5.58e-04 | 8466.47 ms | -100.0% bf16 MFU | 62054 tok/s +step 3927/19560 | loss 3.676548 (+0.99z)| norm 0.2512 (-1.12z)| lr 5.58e-04 | 8457.35 ms | -100.0% bf16 MFU | 62050 tok/s +step 3928/19560 | loss 3.666034 (+0.71z)| norm 0.2597 (-0.84z)| lr 5.58e-04 | 8465.99 ms | -100.0% bf16 MFU | 62044 tok/s +step 3929/19560 | loss 3.637094 (-0.05z)| norm 0.2699 (-0.51z)| lr 5.58e-04 | 8460.34 ms | -100.0% bf16 MFU | 62041 tok/s +step 3930/19560 | loss 3.632545 (-0.15z)| norm 0.2687 (-0.55z)| lr 5.58e-04 | 8458.84 ms | -100.0% bf16 MFU | 62038 tok/s +step 3931/19560 | loss 3.599191 (-1.03z)| norm 0.3103 (+0.79z)| lr 5.58e-04 | 8458.91 ms | -100.0% bf16 MFU | 62035 tok/s +step 3932/19560 | loss 3.623286 (-0.40z)| norm 0.3130 (+0.86z)| lr 5.58e-04 | 8462.89 ms | -100.0% bf16 MFU | 62031 tok/s +step 3933/19560 | loss 3.667859 (+0.82z)| norm 0.2698 (-0.54z)| lr 5.58e-04 | 8469.66 ms | -100.0% bf16 MFU | 62024 tok/s +step 3934/19560 | loss 3.661963 (+0.66z)| norm 0.2918 (+0.17z)| lr 5.58e-04 | 8464.68 ms | -100.0% bf16 MFU | 62020 tok/s +step 3935/19560 | loss 3.640238 (+0.07z)| norm 0.2760 (-0.35z)| lr 5.58e-04 | 8466.53 ms | -100.0% bf16 MFU | 62015 tok/s +step 3936/19560 | loss 3.601020 (-0.98z)| norm 0.2649 (-0.71z)| lr 5.57e-04 | 8465.74 ms | -100.0% bf16 MFU | 62011 tok/s +step 3937/19560 | loss 3.616513 (-0.55z)| norm 0.2913 (+0.14z)| lr 5.57e-04 | 8462.70 ms | -100.0% bf16 MFU | 62008 tok/s +step 3938/19560 | loss 3.644269 (+0.21z)| norm 0.2943 (+0.24z)| lr 5.57e-04 | 8460.19 ms | -100.0% bf16 MFU | 62006 tok/s +step 3939/19560 | loss 3.576486 (-1.62z)| norm 0.2756 (-0.37z)| lr 5.57e-04 | 8450.23 ms | -100.0% bf16 MFU | 62008 tok/s +step 3940/19560 | loss 3.656974 (+0.55z)| norm 0.2910 (+0.12z)| lr 5.57e-04 | 8460.24 ms | -100.0% bf16 MFU | 62006 tok/s +step 3941/19560 | loss 3.618304 (-0.50z)| norm 0.2641 (-0.76z)| lr 5.57e-04 | 8451.51 ms | -100.0% bf16 MFU | 62008 tok/s +step 3942/19560 | loss 3.584924 (-1.39z)| norm 0.2792 (-0.26z)| lr 5.57e-04 | 8453.52 ms | -100.0% bf16 MFU | 62008 tok/s +step 3943/19560 | loss 3.583339 (-1.41z)| norm 0.2725 (-0.48z)| lr 5.57e-04 | 8447.01 ms | -100.0% bf16 MFU | 62011 tok/s +step 3944/19560 | loss 3.618789 (-0.45z)| norm 0.2601 (-0.89z)| lr 5.57e-04 | 8451.59 ms | -100.0% bf16 MFU | 62012 tok/s +step 3945/19560 | loss 3.560104 (-1.98z)| norm 0.2908 (+0.11z)| lr 5.57e-04 | 8451.62 ms | -100.0% bf16 MFU | 62013 tok/s +step 3946/19560 | loss 3.565169 (-1.81z)| norm 0.2775 (-0.35z)| lr 5.57e-04 | 8446.41 ms | -100.0% bf16 MFU | 62016 tok/s +step 3947/19560 | loss 3.598190 (-0.93z)| norm 0.2907 (+0.09z)| lr 5.57e-04 | 8447.95 ms | -100.0% bf16 MFU | 62019 tok/s +step 3948/19560 | loss 3.623500 (-0.26z)| norm 0.2963 (+0.27z)| lr 5.57e-04 | 8445.12 ms | -100.0% bf16 MFU | 62022 tok/s +step 3949/19560 | loss 3.613249 (-0.53z)| norm 0.2856 (-0.08z)| lr 5.57e-04 | 8449.08 ms | -100.0% bf16 MFU | 62023 tok/s +step 3950/19560 | loss 3.639923 (+0.16z)| norm 0.2604 (-0.92z)| lr 5.57e-04 | 8447.91 ms | -100.0% bf16 MFU | 62025 tok/s +step 3951/19560 | loss 3.604516 (-0.75z)| norm 0.2719 (-0.53z)| lr 5.57e-04 | 8451.96 ms | -100.0% bf16 MFU | 62026 tok/s +step 3952/19560 | loss 3.555259 (-1.99z)| norm 0.2802 (-0.25z)| lr 5.57e-04 | 8447.80 ms | -100.0% bf16 MFU | 62027 tok/s +step 3953/19560 | loss 3.615307 (-0.43z)| norm 0.2781 (-0.32z)| lr 5.57e-04 | 8464.78 ms | -100.0% bf16 MFU | 62023 tok/s +step 3954/19560 | loss 3.632882 (+0.02z)| norm 0.2656 (-0.73z)| lr 5.57e-04 | 8453.37 ms | -100.0% bf16 MFU | 62023 tok/s +step 3955/19560 | loss 3.634763 (+0.08z)| norm 0.2853 (-0.07z)| lr 5.57e-04 | 8443.38 ms | -100.0% bf16 MFU | 62026 tok/s +step 3956/19560 | loss 3.628918 (-0.08z)| norm 0.2749 (-0.42z)| lr 5.57e-04 | 8449.89 ms | -100.0% bf16 MFU | 62027 tok/s +step 3957/19560 | loss 3.601936 (-0.77z)| norm 0.2596 (-0.92z)| lr 5.57e-04 | 8450.98 ms | -100.0% bf16 MFU | 62028 tok/s +step 3958/19560 | loss 3.597632 (-0.88z)| norm 0.3017 (+0.48z)| lr 5.57e-04 | 8456.68 ms | -100.0% bf16 MFU | 62026 tok/s +step 3959/19560 | loss 3.600922 (-0.79z)| norm 0.3133 (+0.86z)| lr 5.57e-04 | 8453.00 ms | -100.0% bf16 MFU | 62026 tok/s +step 3960/19560 | loss 3.627735 (-0.09z)| norm 0.2666 (-0.70z)| lr 5.57e-04 | 8462.88 ms | -100.0% bf16 MFU | 62023 tok/s +step 3961/19560 | loss 3.618736 (-0.33z)| norm 0.2675 (-0.67z)| lr 5.57e-04 | 8453.69 ms | -100.0% bf16 MFU | 62022 tok/s +step 3962/19560 | loss 3.606316 (-0.65z)| norm 0.2873 (-0.00z)| lr 5.57e-04 | 8448.70 ms | -100.0% bf16 MFU | 62024 tok/s +step 3963/19560 | loss 3.613146 (-0.47z)| norm 0.3217 (+1.17z)| lr 5.57e-04 | 8456.79 ms | -100.0% bf16 MFU | 62023 tok/s +step 3964/19560 | loss 3.576021 (-1.44z)| norm 0.3201 (+1.11z)| lr 5.57e-04 | 8454.03 ms | -100.0% bf16 MFU | 62022 tok/s +step 3965/19560 | loss 3.639468 (+0.23z)| norm 0.2921 (+0.16z)| lr 5.57e-04 | 8446.76 ms | -100.0% bf16 MFU | 62025 tok/s +step 3966/19560 | loss 3.591958 (-1.01z)| norm 0.2646 (-0.76z)| lr 5.57e-04 | 8452.55 ms | -100.0% bf16 MFU | 62025 tok/s +step 3967/19560 | loss 3.644680 (+0.38z)| norm 0.2506 (-1.21z)| lr 5.57e-04 | 8452.64 ms | -100.0% bf16 MFU | 62025 tok/s +step 3968/19560 | loss 3.602912 (-0.72z)| norm 0.2782 (-0.28z)| lr 5.57e-04 | 8449.49 ms | -100.0% bf16 MFU | 62026 tok/s +step 3969/19560 | loss 3.610247 (-0.51z)| norm 0.2831 (-0.10z)| lr 5.57e-04 | 8458.41 ms | -100.0% bf16 MFU | 62024 tok/s +step 3970/19560 | loss 3.618974 (-0.28z)| norm 0.2577 (-0.96z)| lr 5.57e-04 | 8455.00 ms | -100.0% bf16 MFU | 62023 tok/s +step 3971/19560 | loss 3.646467 (+0.44z)| norm 0.2623 (-0.79z)| lr 5.57e-04 | 8451.82 ms | -100.0% bf16 MFU | 62024 tok/s +step 3972/19560 | loss 3.636718 (+0.19z)| norm 0.2962 (+0.40z)| lr 5.57e-04 | 8451.05 ms | -100.0% bf16 MFU | 62024 tok/s +step 3973/19560 | loss 3.621606 (-0.22z)| norm 0.3222 (+1.32z)| lr 5.57e-04 | 8454.59 ms | -100.0% bf16 MFU | 62024 tok/s +step 3974/19560 | loss 3.594767 (-0.93z)| norm 0.3066 (+0.76z)| lr 5.57e-04 | 8451.47 ms | -100.0% bf16 MFU | 62024 tok/s +step 3975/19560 | loss 3.658107 (+0.74z)| norm 0.3065 (+0.75z)| lr 5.56e-04 | 8452.64 ms | -100.0% bf16 MFU | 62025 tok/s +step 3976/19560 | loss 3.578670 (-1.34z)| norm 0.2934 (+0.28z)| lr 5.56e-04 | 8461.06 ms | -100.0% bf16 MFU | 62022 tok/s +step 3977/19560 | loss 3.633217 (+0.11z)| norm 0.2944 (+0.48z)| lr 5.56e-04 | 8454.76 ms | -100.0% bf16 MFU | 62021 tok/s +step 3978/19560 | loss 3.600904 (-0.74z)| norm 0.2784 (-0.25z)| lr 5.56e-04 | 8453.33 ms | -100.0% bf16 MFU | 62021 tok/s +step 3979/19560 | loss 3.580864 (-1.25z)| norm 0.2843 (+0.03z)| lr 5.56e-04 | 8457.19 ms | -100.0% bf16 MFU | 62020 tok/s +step 3980/19560 | loss 3.682639 (+1.40z)| norm 0.2838 (-0.00z)| lr 5.56e-04 | 8460.39 ms | -100.0% bf16 MFU | 62017 tok/s +step 3981/19560 | loss 3.535926 (-2.36z)| norm 0.3015 (+0.79z)| lr 5.56e-04 | 8451.81 ms | -100.0% bf16 MFU | 62018 tok/s +step 3982/19560 | loss 3.600841 (-0.69z)| norm 0.2660 (-0.82z)| lr 5.56e-04 | 8457.89 ms | -100.0% bf16 MFU | 62016 tok/s +step 3983/19560 | loss 3.638351 (+0.26z)| norm 0.2713 (-0.57z)| lr 5.56e-04 | 8459.85 ms | -100.0% bf16 MFU | 62014 tok/s +step 3984/19560 | loss 3.583729 (-1.13z)| norm 0.2883 (+0.20z)| lr 5.56e-04 | 8450.68 ms | -100.0% bf16 MFU | 62016 tok/s +step 3985/19560 | loss 3.616918 (-0.28z)| norm 0.3052 (+0.95z)| lr 5.56e-04 | 8457.68 ms | -100.0% bf16 MFU | 62014 tok/s +step 3986/19560 | loss 3.613939 (-0.36z)| norm 0.2876 (+0.15z)| lr 5.56e-04 | 8452.77 ms | -100.0% bf16 MFU | 62015 tok/s +step 3987/19560 | loss 3.626673 (-0.03z)| norm 0.3115 (+1.22z)| lr 5.56e-04 | 8456.57 ms | -100.0% bf16 MFU | 62014 tok/s +step 3988/19560 | loss 3.600281 (-0.69z)| norm 0.2824 (-0.10z)| lr 5.56e-04 | 8453.50 ms | -100.0% bf16 MFU | 62014 tok/s +step 3989/19560 | loss 3.590025 (-0.95z)| norm 0.2619 (-1.02z)| lr 5.56e-04 | 8453.67 ms | -100.0% bf16 MFU | 62015 tok/s +step 3990/19560 | loss 3.637917 (+0.28z)| norm 0.2750 (-0.42z)| lr 5.56e-04 | 8448.25 ms | -100.0% bf16 MFU | 62017 tok/s +step 3991/19560 | loss 3.529826 (-2.42z)| norm 0.3016 (+0.78z)| lr 5.56e-04 | 8441.13 ms | -100.0% bf16 MFU | 62021 tok/s +step 3992/19560 | loss 3.592155 (-0.83z)| norm 0.2792 (-0.23z)| lr 5.56e-04 | 8438.36 ms | -100.0% bf16 MFU | 62027 tok/s +step 3993/19560 | loss 3.587489 (-0.94z)| norm 0.2662 (-0.81z)| lr 5.56e-04 | 8435.90 ms | -100.0% bf16 MFU | 62033 tok/s +step 3994/19560 | loss 3.613779 (-0.27z)| norm 0.3037 (+0.86z)| lr 5.56e-04 | 8435.35 ms | -100.0% bf16 MFU | 62039 tok/s +step 3995/19560 | loss 3.612800 (-0.28z)| norm 0.3286 (+1.98z)| lr 5.56e-04 | 8436.80 ms | -100.0% bf16 MFU | 62044 tok/s +step 3996/19560 | loss 3.636719 (+0.34z)| norm 0.2823 (-0.10z)| lr 5.56e-04 | 8440.50 ms | -100.0% bf16 MFU | 62048 tok/s +step 3997/19560 | loss 3.621591 (-0.05z)| norm 0.2961 (+0.51z)| lr 5.56e-04 | 8436.75 ms | -100.0% bf16 MFU | 62053 tok/s +step 3998/19560 | loss 3.595089 (-0.73z)| norm 0.2815 (-0.16z)| lr 5.56e-04 | 8435.98 ms | -100.0% bf16 MFU | 62057 tok/s +step 3999/19560 | loss 3.599102 (-0.61z)| norm 0.2575 (-1.26z)| lr 5.56e-04 | 8440.88 ms | -100.0% bf16 MFU | 62060 tok/s +step 4000/19560 | loss 3.626227 (+0.10z)| norm 0.2743 (-0.50z)| lr 5.56e-04 | 8444.21 ms | -100.0% bf16 MFU | 62062 tok/s +val loss 3.604953 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2727/10042 = 0.271559 +step 4001/19560 | loss 3.598066 (-0.66z)| norm 0.2565 (-1.29z)| lr 5.56e-04 | 8447.69 ms | -100.0% bf16 MFU | 62062 tok/s +step 4002/19560 | loss 3.581635 (-1.10z)| norm 0.2765 (-0.37z)| lr 5.56e-04 | 8445.55 ms | -100.0% bf16 MFU | 62063 tok/s +step 4003/19560 | loss 3.650444 (+0.74z)| norm 0.2729 (-0.52z)| lr 5.56e-04 | 8446.54 ms | -100.0% bf16 MFU | 62063 tok/s +step 4004/19560 | loss 3.569654 (-1.40z)| norm 0.2627 (-0.99z)| lr 5.56e-04 | 8442.87 ms | -100.0% bf16 MFU | 62065 tok/s +step 4005/19560 | loss 3.643886 (+0.59z)| norm 0.2543 (-1.36z)| lr 5.56e-04 | 8444.77 ms | -100.0% bf16 MFU | 62066 tok/s +step 4006/19560 | loss 3.642020 (+0.53z)| norm 0.2758 (-0.36z)| lr 5.56e-04 | 8449.62 ms | -100.0% bf16 MFU | 62065 tok/s +step 4007/19560 | loss 3.548844 (-1.94z)| norm 0.2622 (-1.00z)| lr 5.56e-04 | 8447.52 ms | -100.0% bf16 MFU | 62065 tok/s +step 4008/19560 | loss 3.592797 (-0.76z)| norm 0.2607 (-1.05z)| lr 5.56e-04 | 8452.50 ms | -100.0% bf16 MFU | 62063 tok/s +step 4009/19560 | loss 3.640901 (+0.51z)| norm 0.3092 (+1.18z)| lr 5.56e-04 | 8448.28 ms | -100.0% bf16 MFU | 62063 tok/s +step 4010/19560 | loss 3.669518 (+1.25z)| norm 0.3209 (+1.69z)| lr 5.56e-04 | 8450.59 ms | -100.0% bf16 MFU | 62062 tok/s +step 4011/19560 | loss 3.644126 (+0.57z)| norm 0.2743 (-0.45z)| lr 5.56e-04 | 8449.44 ms | -100.0% bf16 MFU | 62061 tok/s +step 4012/19560 | loss 3.605492 (-0.45z)| norm 0.3025 (+0.84z)| lr 5.56e-04 | 8453.22 ms | -100.0% bf16 MFU | 62059 tok/s +step 4013/19560 | loss 3.639601 (+0.47z)| norm 0.2838 (-0.01z)| lr 5.55e-04 | 8449.69 ms | -100.0% bf16 MFU | 62059 tok/s +step 4014/19560 | loss 3.635247 (+0.35z)| norm 0.2686 (-0.71z)| lr 5.55e-04 | 8449.49 ms | -100.0% bf16 MFU | 62058 tok/s +step 4015/19560 | loss 3.597208 (-0.66z)| norm 0.2969 (+0.69z)| lr 5.55e-04 | 8453.48 ms | -100.0% bf16 MFU | 62056 tok/s +step 4016/19560 | loss 3.630733 (+0.23z)| norm 0.3206 (+1.83z)| lr 5.55e-04 | 8449.76 ms | -100.0% bf16 MFU | 62056 tok/s +step 4017/19560 | loss 3.553216 (-1.80z)| norm 0.3242 (+1.96z)| lr 5.55e-04 | 8455.10 ms | -100.0% bf16 MFU | 62054 tok/s +step 4018/19560 | loss 3.575747 (-1.19z)| norm 0.2973 (+0.66z)| lr 5.55e-04 | 8451.70 ms | -100.0% bf16 MFU | 62052 tok/s +step 4019/19560 | loss 3.606976 (-0.37z)| norm 0.2988 (+0.72z)| lr 5.55e-04 | 8450.11 ms | -100.0% bf16 MFU | 62052 tok/s +step 4020/19560 | loss 3.619723 (-0.02z)| norm 0.2629 (-0.99z)| lr 5.55e-04 | 8445.24 ms | -100.0% bf16 MFU | 62054 tok/s +step 4021/19560 | loss 3.578691 (-1.10z)| norm 0.2952 (+0.54z)| lr 5.55e-04 | 8452.44 ms | -100.0% bf16 MFU | 62052 tok/s +step 4022/19560 | loss 3.673290 (+1.41z)| norm 0.2930 (+0.44z)| lr 5.55e-04 | 8447.19 ms | -100.0% bf16 MFU | 62053 tok/s +step 4023/19560 | loss 3.595075 (-0.66z)| norm 0.2892 (+0.32z)| lr 5.55e-04 | 8446.61 ms | -100.0% bf16 MFU | 62054 tok/s +step 4024/19560 | loss 3.595315 (-0.64z)| norm 0.2628 (-1.06z)| lr 5.55e-04 | 8452.30 ms | -100.0% bf16 MFU | 62053 tok/s +step 4025/19560 | loss 3.684309 (+1.70z)| norm 0.2845 (+0.08z)| lr 5.55e-04 | 8450.26 ms | -100.0% bf16 MFU | 62052 tok/s +step 4026/19560 | loss 3.607654 (-0.31z)| norm 0.2829 (-0.01z)| lr 5.55e-04 | 8449.74 ms | -100.0% bf16 MFU | 62052 tok/s +step 4027/19560 | loss 3.521754 (-2.49z)| norm 0.2775 (-0.31z)| lr 5.55e-04 | 8448.81 ms | -100.0% bf16 MFU | 62052 tok/s +step 4028/19560 | loss 3.573826 (-1.13z)| norm 0.2579 (-1.36z)| lr 5.55e-04 | 8454.46 ms | -100.0% bf16 MFU | 62050 tok/s +step 4029/19560 | loss 3.542779 (-1.88z)| norm 0.2890 (+0.29z)| lr 5.55e-04 | 8450.77 ms | -100.0% bf16 MFU | 62050 tok/s +step 4030/19560 | loss 3.599827 (-0.43z)| norm 0.3223 (+2.08z)| lr 5.55e-04 | 8448.92 ms | -100.0% bf16 MFU | 62050 tok/s +step 4031/19560 | loss 3.612341 (-0.11z)| norm 0.3054 (+1.14z)| lr 5.55e-04 | 8447.40 ms | -100.0% bf16 MFU | 62051 tok/s +step 4032/19560 | loss 3.614930 (-0.04z)| norm 0.2952 (+0.59z)| lr 5.55e-04 | 8450.34 ms | -100.0% bf16 MFU | 62050 tok/s +step 4033/19560 | loss 3.575953 (-1.02z)| norm 0.2866 (+0.12z)| lr 5.55e-04 | 8447.57 ms | -100.0% bf16 MFU | 62051 tok/s +step 4034/19560 | loss 3.658006 (+1.07z)| norm 0.2645 (-1.05z)| lr 5.55e-04 | 8450.90 ms | -100.0% bf16 MFU | 62050 tok/s +step 4035/19560 | loss 3.625962 (+0.25z)| norm 0.2761 (-0.41z)| lr 5.55e-04 | 8451.48 ms | -100.0% bf16 MFU | 62050 tok/s +step 4036/19560 | loss 3.576029 (-1.02z)| norm 0.2762 (-0.41z)| lr 5.55e-04 | 8450.79 ms | -100.0% bf16 MFU | 62049 tok/s +step 4037/19560 | loss 3.649730 (+0.88z)| norm 0.2520 (-1.71z)| lr 5.55e-04 | 8449.54 ms | -100.0% bf16 MFU | 62049 tok/s +step 4038/19560 | loss 3.540284 (-1.89z)| norm 0.2610 (-1.21z)| lr 5.55e-04 | 8452.21 ms | -100.0% bf16 MFU | 62048 tok/s +step 4039/19560 | loss 3.622365 (+0.19z)| norm 0.2512 (-1.71z)| lr 5.55e-04 | 8451.14 ms | -100.0% bf16 MFU | 62048 tok/s +step 4040/19560 | loss 3.614311 (-0.01z)| norm 0.2426 (-2.12z)| lr 5.55e-04 | 8451.29 ms | -100.0% bf16 MFU | 62047 tok/s +step 4041/19560 | loss 3.678068 (+1.58z)| norm 0.2748 (-0.41z)| lr 5.55e-04 | 8447.11 ms | -100.0% bf16 MFU | 62048 tok/s +step 4042/19560 | loss 3.545187 (-1.73z)| norm 0.2617 (-1.09z)| lr 5.55e-04 | 8448.45 ms | -100.0% bf16 MFU | 62049 tok/s +step 4043/19560 | loss 3.633098 (+0.45z)| norm 0.2608 (-1.12z)| lr 5.55e-04 | 8450.72 ms | -100.0% bf16 MFU | 62048 tok/s +step 4044/19560 | loss 3.602705 (-0.29z)| norm 0.2693 (-0.66z)| lr 5.55e-04 | 8453.00 ms | -100.0% bf16 MFU | 62047 tok/s +step 4045/19560 | loss 3.675894 (+1.61z)| norm 0.2931 (+0.59z)| lr 5.55e-04 | 8448.07 ms | -100.0% bf16 MFU | 62048 tok/s +step 4046/19560 | loss 3.612544 (-0.04z)| norm 0.3087 (+1.39z)| lr 5.55e-04 | 8447.91 ms | -100.0% bf16 MFU | 62048 tok/s +step 4047/19560 | loss 3.670305 (+1.44z)| norm 0.2978 (+0.81z)| lr 5.55e-04 | 8447.95 ms | -100.0% bf16 MFU | 62049 tok/s +step 4048/19560 | loss 3.629782 (+0.39z)| norm 0.2734 (-0.47z)| lr 5.55e-04 | 8450.36 ms | -100.0% bf16 MFU | 62049 tok/s +step 4049/19560 | loss 3.579951 (-0.90z)| norm 0.2904 (+0.43z)| lr 5.55e-04 | 8446.09 ms | -100.0% bf16 MFU | 62050 tok/s +step 4050/19560 | loss 3.534905 (-2.01z)| norm 0.2968 (+0.77z)| lr 5.55e-04 | 8449.71 ms | -100.0% bf16 MFU | 62050 tok/s +step 4051/19560 | loss 3.594235 (-0.49z)| norm 0.2759 (-0.35z)| lr 5.54e-04 | 8449.77 ms | -100.0% bf16 MFU | 62050 tok/s +step 4052/19560 | loss 3.575701 (-0.96z)| norm 0.2654 (-0.90z)| lr 5.54e-04 | 8451.61 ms | -100.0% bf16 MFU | 62049 tok/s +step 4053/19560 | loss 3.616780 (+0.09z)| norm 0.2856 (+0.17z)| lr 5.54e-04 | 8444.04 ms | -100.0% bf16 MFU | 62051 tok/s +step 4054/19560 | loss 3.635188 (+0.68z)| norm 0.3382 (+2.84z)| lr 5.54e-04 | 8436.85 ms | -100.0% bf16 MFU | 62056 tok/s +step 4055/19560 | loss 3.613562 (+0.07z)| norm 0.3398 (+2.82z)| lr 5.54e-04 | 8438.11 ms | -100.0% bf16 MFU | 62059 tok/s +step 4056/19560 | loss 3.600143 (-0.32z)| norm 0.3067 (+1.14z)| lr 5.54e-04 | 8436.08 ms | -100.0% bf16 MFU | 62064 tok/s +step 4057/19560 | loss 3.603774 (-0.20z)| norm 0.3048 (+1.03z)| lr 5.54e-04 | 8437.08 ms | -100.0% bf16 MFU | 62068 tok/s +step 4058/19560 | loss 3.676182 (+1.92z)| norm 0.3240 (+1.95z)| lr 5.54e-04 | 8435.95 ms | -100.0% bf16 MFU | 62072 tok/s +step 4059/19560 | loss 3.571615 (-1.14z)| norm 0.2896 (+0.25z)| lr 5.54e-04 | 8433.84 ms | -100.0% bf16 MFU | 62076 tok/s +step 4060/19560 | loss 3.649740 (+1.13z)| norm 0.2767 (-0.38z)| lr 5.54e-04 | 8431.59 ms | -100.0% bf16 MFU | 62082 tok/s +step 4061/19560 | loss 3.631972 (+0.63z)| norm 0.2734 (-0.55z)| lr 5.54e-04 | 8433.24 ms | -100.0% bf16 MFU | 62086 tok/s +step 4062/19560 | loss 3.716361 (+3.02z)| norm 0.2697 (-0.72z)| lr 5.54e-04 | 8439.43 ms | -100.0% bf16 MFU | 62088 tok/s +step 4063/19560 | loss 3.584738 (-0.74z)| norm 0.2731 (-0.55z)| lr 5.54e-04 | 8432.45 ms | -100.0% bf16 MFU | 62092 tok/s +step 4064/19560 | loss 3.594423 (-0.46z)| norm 0.2546 (-1.47z)| lr 5.54e-04 | 8434.89 ms | -100.0% bf16 MFU | 62096 tok/s +step 4065/19560 | loss 3.641095 (+0.87z)| norm 0.2394 (-2.17z)| lr 5.54e-04 | 8435.36 ms | -100.0% bf16 MFU | 62098 tok/s +step 4066/19560 | loss 3.635362 (+0.71z)| norm 0.2623 (-1.03z)| lr 5.54e-04 | 8438.14 ms | -100.0% bf16 MFU | 62100 tok/s +step 4067/19560 | loss 3.618366 (+0.21z)| norm 0.2697 (-0.66z)| lr 5.54e-04 | 8437.67 ms | -100.0% bf16 MFU | 62102 tok/s +step 4068/19560 | loss 3.562838 (-1.36z)| norm 0.2680 (-0.74z)| lr 5.54e-04 | 8442.67 ms | -100.0% bf16 MFU | 62102 tok/s +step 4069/19560 | loss 3.593638 (-0.47z)| norm 0.2866 (+0.16z)| lr 5.54e-04 | 8431.69 ms | -100.0% bf16 MFU | 62106 tok/s +step 4070/19560 | loss 3.605928 (-0.12z)| norm 0.2732 (-0.49z)| lr 5.54e-04 | 8437.04 ms | -100.0% bf16 MFU | 62108 tok/s +step 4071/19560 | loss 3.575647 (-0.99z)| norm 0.2747 (-0.42z)| lr 5.54e-04 | 8440.09 ms | -100.0% bf16 MFU | 62108 tok/s +step 4072/19560 | loss 3.669414 (+1.67z)| norm 0.2511 (-1.56z)| lr 5.54e-04 | 8440.09 ms | -100.0% bf16 MFU | 62109 tok/s +step 4073/19560 | loss 3.652389 (+1.17z)| norm 0.2852 (+0.10z)| lr 5.54e-04 | 8434.58 ms | -100.0% bf16 MFU | 62111 tok/s +step 4074/19560 | loss 3.601895 (-0.28z)| norm 0.2722 (-0.53z)| lr 5.54e-04 | 8436.15 ms | -100.0% bf16 MFU | 62113 tok/s +step 4075/19560 | loss 3.574458 (-1.05z)| norm 0.2655 (-0.84z)| lr 5.54e-04 | 8439.82 ms | -100.0% bf16 MFU | 62113 tok/s +step 4076/19560 | loss 3.612140 (+0.02z)| norm 0.2770 (-0.28z)| lr 5.54e-04 | 8440.01 ms | -100.0% bf16 MFU | 62114 tok/s +step 4077/19560 | loss 3.632451 (+0.60z)| norm 0.2769 (-0.28z)| lr 5.54e-04 | 8437.86 ms | -100.0% bf16 MFU | 62115 tok/s +step 4078/19560 | loss 3.625532 (+0.41z)| norm 0.2805 (-0.11z)| lr 5.54e-04 | 8440.52 ms | -100.0% bf16 MFU | 62115 tok/s +step 4079/19560 | loss 3.657347 (+1.29z)| norm 0.3295 (+2.21z)| lr 5.54e-04 | 8436.24 ms | -100.0% bf16 MFU | 62116 tok/s +step 4080/19560 | loss 3.592728 (-0.55z)| norm 0.2886 (+0.25z)| lr 5.54e-04 | 8443.21 ms | -100.0% bf16 MFU | 62115 tok/s +step 4081/19560 | loss 3.591957 (-0.57z)| norm 0.2926 (+0.44z)| lr 5.54e-04 | 8438.11 ms | -100.0% bf16 MFU | 62116 tok/s +step 4082/19560 | loss 3.579421 (-0.91z)| norm 0.2735 (-0.48z)| lr 5.54e-04 | 8439.73 ms | -100.0% bf16 MFU | 62117 tok/s +step 4083/19560 | loss 3.625316 (+0.40z)| norm 0.2717 (-0.56z)| lr 5.54e-04 | 8441.19 ms | -100.0% bf16 MFU | 62116 tok/s +step 4084/19560 | loss 3.640316 (+0.82z)| norm 0.2882 (+0.23z)| lr 5.54e-04 | 8438.28 ms | -100.0% bf16 MFU | 62117 tok/s +step 4085/19560 | loss 3.588384 (-0.65z)| norm 0.2567 (-1.28z)| lr 5.54e-04 | 8443.82 ms | -100.0% bf16 MFU | 62116 tok/s +step 4086/19560 | loss 3.597008 (-0.41z)| norm 0.2752 (-0.39z)| lr 5.54e-04 | 8439.58 ms | -100.0% bf16 MFU | 62116 tok/s +step 4087/19560 | loss 3.693398 (+2.27z)| norm 0.2724 (-0.51z)| lr 5.54e-04 | 8438.49 ms | -100.0% bf16 MFU | 62117 tok/s +step 4088/19560 | loss 3.563624 (-1.33z)| norm 0.2806 (-0.12z)| lr 5.54e-04 | 8438.02 ms | -100.0% bf16 MFU | 62118 tok/s +step 4089/19560 | loss 3.594131 (-0.48z)| norm 0.2955 (+0.59z)| lr 5.53e-04 | 8440.20 ms | -100.0% bf16 MFU | 62118 tok/s +step 4090/19560 | loss 3.617570 (+0.17z)| norm 0.3119 (+1.37z)| lr 5.53e-04 | 8441.75 ms | -100.0% bf16 MFU | 62117 tok/s +step 4091/19560 | loss 3.592523 (-0.52z)| norm 0.3019 (+0.90z)| lr 5.53e-04 | 8439.56 ms | -100.0% bf16 MFU | 62117 tok/s +step 4092/19560 | loss 3.614828 (+0.09z)| norm 0.2763 (-0.33z)| lr 5.53e-04 | 8440.84 ms | -100.0% bf16 MFU | 62117 tok/s +step 4093/19560 | loss 3.626698 (+0.42z)| norm 0.2702 (-0.62z)| lr 5.53e-04 | 8439.32 ms | -100.0% bf16 MFU | 62118 tok/s +step 4094/19560 | loss 3.573904 (-1.04z)| norm 0.2553 (-1.35z)| lr 5.53e-04 | 8443.14 ms | -100.0% bf16 MFU | 62117 tok/s +step 4095/19560 | loss 3.644182 (+0.91z)| norm 0.2614 (-1.06z)| lr 5.53e-04 | 8440.85 ms | -100.0% bf16 MFU | 62116 tok/s +step 4096/19560 | loss 3.643371 (+0.88z)| norm 0.2942 (+0.55z)| lr 5.53e-04 | 8439.66 ms | -100.0% bf16 MFU | 62117 tok/s +step 4097/19560 | loss 3.601447 (-0.28z)| norm 0.2703 (-0.61z)| lr 5.53e-04 | 8440.14 ms | -100.0% bf16 MFU | 62117 tok/s +step 4098/19560 | loss 3.596728 (-0.41z)| norm 0.2803 (-0.14z)| lr 5.53e-04 | 8437.42 ms | -100.0% bf16 MFU | 62118 tok/s +step 4099/19560 | loss 3.603955 (-0.20z)| norm 0.2929 (+0.48z)| lr 5.53e-04 | 8442.93 ms | -100.0% bf16 MFU | 62117 tok/s +step 4100/19560 | loss 3.547131 (-1.74z)| norm 0.2833 (+0.01z)| lr 5.53e-04 | 8440.06 ms | -100.0% bf16 MFU | 62117 tok/s +step 4101/19560 | loss 3.575925 (-0.93z)| norm 0.3395 (+2.75z)| lr 5.53e-04 | 8442.69 ms | -100.0% bf16 MFU | 62116 tok/s +step 4102/19560 | loss 3.581704 (-0.77z)| norm 0.3420 (+2.78z)| lr 5.53e-04 | 8442.42 ms | -100.0% bf16 MFU | 62115 tok/s +step 4103/19560 | loss 3.596648 (-0.35z)| norm 0.3109 (+1.30z)| lr 5.53e-04 | 8440.78 ms | -100.0% bf16 MFU | 62115 tok/s +step 4104/19560 | loss 3.624665 (+0.41z)| norm 0.3042 (+0.97z)| lr 5.53e-04 | 8437.16 ms | -100.0% bf16 MFU | 62116 tok/s +step 4105/19560 | loss 3.666234 (+1.54z)| norm 0.2783 (-0.25z)| lr 5.53e-04 | 8437.49 ms | -100.0% bf16 MFU | 62118 tok/s +step 4106/19560 | loss 3.629244 (+0.52z)| norm 0.2709 (-0.60z)| lr 5.53e-04 | 8432.47 ms | -100.0% bf16 MFU | 62120 tok/s +step 4107/19560 | loss 3.580768 (-0.81z)| norm 0.2833 (-0.01z)| lr 5.53e-04 | 8433.92 ms | -100.0% bf16 MFU | 62123 tok/s +step 4108/19560 | loss 3.603849 (-0.16z)| norm 0.2561 (-1.28z)| lr 5.53e-04 | 8433.49 ms | -100.0% bf16 MFU | 62125 tok/s +step 4109/19560 | loss 3.609984 (-0.01z)| norm 0.2612 (-1.03z)| lr 5.53e-04 | 8431.47 ms | -100.0% bf16 MFU | 62128 tok/s +step 4110/19560 | loss 3.559815 (-1.41z)| norm 0.2702 (-0.60z)| lr 5.53e-04 | 8434.18 ms | -100.0% bf16 MFU | 62129 tok/s +step 4111/19560 | loss 3.599462 (-0.29z)| norm 0.2450 (-1.76z)| lr 5.53e-04 | 8459.01 ms | -100.0% bf16 MFU | 62122 tok/s +step 4112/19560 | loss 3.657639 (+1.33z)| norm 0.3000 (+0.79z)| lr 5.53e-04 | 8457.91 ms | -100.0% bf16 MFU | 62115 tok/s +step 4113/19560 | loss 3.581668 (-0.79z)| norm 0.3446 (+2.77z)| lr 5.53e-04 | 8459.00 ms | -100.0% bf16 MFU | 62108 tok/s +step 4114/19560 | loss 3.547196 (-1.72z)| norm 0.3871 (+4.31z)| lr 5.53e-04 | 8457.84 ms | -100.0% bf16 MFU | 62102 tok/s +step 4115/19560 | loss 3.709527 (+2.67z)| norm 0.3244 (+1.67z)| lr 5.53e-04 | 8457.76 ms | -100.0% bf16 MFU | 62097 tok/s +step 4116/19560 | loss 3.574100 (-0.96z)| norm 0.2763 (-0.32z)| lr 5.53e-04 | 8457.14 ms | -100.0% bf16 MFU | 62092 tok/s +step 4117/19560 | loss 3.601306 (-0.23z)| norm 0.2959 (+0.48z)| lr 5.53e-04 | 8460.47 ms | -100.0% bf16 MFU | 62086 tok/s +step 4118/19560 | loss 3.700897 (+2.37z)| norm 0.3230 (+1.58z)| lr 5.53e-04 | 8455.36 ms | -100.0% bf16 MFU | 62082 tok/s +step 4119/19560 | loss 3.556948 (-1.42z)| norm 0.3227 (+1.55z)| lr 5.53e-04 | 8457.03 ms | -100.0% bf16 MFU | 62077 tok/s +step 4120/19560 | loss 3.585303 (-0.67z)| norm 0.2957 (+0.44z)| lr 5.53e-04 | 8455.57 ms | -100.0% bf16 MFU | 62074 tok/s +step 4121/19560 | loss 3.582116 (-0.75z)| norm 0.2930 (+0.32z)| lr 5.53e-04 | 8460.06 ms | -100.0% bf16 MFU | 62069 tok/s +step 4122/19560 | loss 3.560021 (-1.32z)| norm 0.2810 (-0.17z)| lr 5.53e-04 | 8456.95 ms | -100.0% bf16 MFU | 62065 tok/s +step 4123/19560 | loss 3.559179 (-1.32z)| norm 0.2698 (-0.61z)| lr 5.53e-04 | 8459.54 ms | -100.0% bf16 MFU | 62060 tok/s +step 4124/19560 | loss 3.600674 (-0.23z)| norm 0.2712 (-0.55z)| lr 5.53e-04 | 8455.86 ms | -100.0% bf16 MFU | 62058 tok/s +step 4125/19560 | loss 3.551419 (-1.49z)| norm 0.2647 (-0.81z)| lr 5.53e-04 | 8452.74 ms | -100.0% bf16 MFU | 62056 tok/s +step 4126/19560 | loss 3.610691 (+0.04z)| norm 0.3390 (+2.20z)| lr 5.52e-04 | 8457.81 ms | -100.0% bf16 MFU | 62053 tok/s +step 4127/19560 | loss 3.520329 (-2.24z)| norm 0.3311 (+1.84z)| lr 5.52e-04 | 8454.35 ms | -100.0% bf16 MFU | 62051 tok/s +step 4128/19560 | loss 3.578366 (-0.75z)| norm 0.2692 (-0.65z)| lr 5.52e-04 | 8459.72 ms | -100.0% bf16 MFU | 62047 tok/s +step 4129/19560 | loss 3.571391 (-0.92z)| norm 0.2881 (+0.11z)| lr 5.52e-04 | 8458.95 ms | -100.0% bf16 MFU | 62044 tok/s +step 4130/19560 | loss 3.584425 (-0.59z)| norm 0.2689 (-0.66z)| lr 5.52e-04 | 8455.95 ms | -100.0% bf16 MFU | 62041 tok/s +step 4131/19560 | loss 3.648678 (+1.03z)| norm 0.2688 (-0.67z)| lr 5.52e-04 | 8455.47 ms | -100.0% bf16 MFU | 62040 tok/s +step 4132/19560 | loss 3.621972 (+0.35z)| norm 0.3150 (+1.18z)| lr 5.52e-04 | 8455.76 ms | -100.0% bf16 MFU | 62038 tok/s +step 4133/19560 | loss 3.631118 (+0.58z)| norm 0.2714 (-0.58z)| lr 5.52e-04 | 8460.40 ms | -100.0% bf16 MFU | 62034 tok/s +step 4134/19560 | loss 3.586773 (-0.53z)| norm 0.2589 (-1.08z)| lr 5.52e-04 | 8460.91 ms | -100.0% bf16 MFU | 62031 tok/s +step 4135/19560 | loss 3.533604 (-1.87z)| norm 0.2885 (+0.10z)| lr 5.52e-04 | 8455.88 ms | -100.0% bf16 MFU | 62030 tok/s +step 4136/19560 | loss 3.635241 (+0.69z)| norm 0.2696 (-0.67z)| lr 5.52e-04 | 8456.04 ms | -100.0% bf16 MFU | 62028 tok/s +step 4137/19560 | loss 3.620190 (+0.32z)| norm 0.2699 (-0.64z)| lr 5.52e-04 | 8457.74 ms | -100.0% bf16 MFU | 62026 tok/s +step 4138/19560 | loss 3.637242 (+0.76z)| norm 0.2597 (-1.04z)| lr 5.52e-04 | 8462.80 ms | -100.0% bf16 MFU | 62023 tok/s +step 4139/19560 | loss 3.531435 (-1.90z)| norm 0.2710 (-0.58z)| lr 5.52e-04 | 8457.37 ms | -100.0% bf16 MFU | 62021 tok/s +step 4140/19560 | loss 3.578267 (-0.71z)| norm 0.2680 (-0.69z)| lr 5.52e-04 | 8453.17 ms | -100.0% bf16 MFU | 62021 tok/s +step 4141/19560 | loss 3.580792 (-0.64z)| norm 0.2694 (-0.63z)| lr 5.52e-04 | 8454.09 ms | -100.0% bf16 MFU | 62021 tok/s +step 4142/19560 | loss 3.610500 (+0.12z)| norm 0.2923 (+0.30z)| lr 5.52e-04 | 8454.67 ms | -100.0% bf16 MFU | 62020 tok/s +step 4143/19560 | loss 3.593074 (-0.32z)| norm 0.2930 (+0.33z)| lr 5.52e-04 | 8453.76 ms | -100.0% bf16 MFU | 62020 tok/s +step 4144/19560 | loss 3.581620 (-0.60z)| norm 0.3010 (+0.66z)| lr 5.52e-04 | 8458.86 ms | -100.0% bf16 MFU | 62018 tok/s +step 4145/19560 | loss 3.594482 (-0.28z)| norm 0.2977 (+0.54z)| lr 5.52e-04 | 8453.66 ms | -100.0% bf16 MFU | 62018 tok/s +step 4146/19560 | loss 3.600772 (-0.13z)| norm 0.2656 (-0.78z)| lr 5.52e-04 | 8449.23 ms | -100.0% bf16 MFU | 62020 tok/s +step 4147/19560 | loss 3.597658 (-0.21z)| norm 0.2475 (-1.50z)| lr 5.52e-04 | 8453.19 ms | -100.0% bf16 MFU | 62020 tok/s +step 4148/19560 | loss 3.568070 (-0.95z)| norm 0.2457 (-1.56z)| lr 5.52e-04 | 8453.70 ms | -100.0% bf16 MFU | 62020 tok/s +step 4149/19560 | loss 3.633667 (+0.71z)| norm 0.2644 (-0.78z)| lr 5.52e-04 | 8457.15 ms | -100.0% bf16 MFU | 62019 tok/s +step 4150/19560 | loss 3.567139 (-0.97z)| norm 0.2662 (-0.70z)| lr 5.52e-04 | 8459.28 ms | -100.0% bf16 MFU | 62017 tok/s +step 4151/19560 | loss 3.640946 (+0.91z)| norm 0.2798 (-0.14z)| lr 5.52e-04 | 8452.03 ms | -100.0% bf16 MFU | 62017 tok/s +step 4152/19560 | loss 3.584767 (-0.52z)| norm 0.2449 (-1.55z)| lr 5.52e-04 | 8450.60 ms | -100.0% bf16 MFU | 62019 tok/s +step 4153/19560 | loss 3.582099 (-0.58z)| norm 0.2595 (-0.94z)| lr 5.52e-04 | 8445.20 ms | -100.0% bf16 MFU | 62022 tok/s +step 4154/19560 | loss 3.570096 (-0.88z)| norm 0.2458 (-1.47z)| lr 5.52e-04 | 8439.56 ms | -100.0% bf16 MFU | 62027 tok/s +step 4155/19560 | loss 3.593275 (-0.30z)| norm 0.3165 (+1.33z)| lr 5.52e-04 | 8449.88 ms | -100.0% bf16 MFU | 62028 tok/s +step 4156/19560 | loss 3.641161 (+0.94z)| norm 0.3041 (+0.82z)| lr 5.52e-04 | 8452.04 ms | -100.0% bf16 MFU | 62028 tok/s +step 4157/19560 | loss 3.605233 (-0.01z)| norm 0.2689 (-0.57z)| lr 5.52e-04 | 8454.57 ms | -100.0% bf16 MFU | 62027 tok/s +step 4158/19560 | loss 3.636448 (+0.81z)| norm 0.3999 (+4.29z)| lr 5.52e-04 | 8452.74 ms | -100.0% bf16 MFU | 62027 tok/s +step 4159/19560 | loss 3.582115 (-0.63z)| norm 0.3231 (+1.44z)| lr 5.52e-04 | 8453.41 ms | -100.0% bf16 MFU | 62027 tok/s +step 4160/19560 | loss 3.637412 (+0.83z)| norm 0.3117 (+1.01z)| lr 5.52e-04 | 8451.37 ms | -100.0% bf16 MFU | 62027 tok/s +step 4161/19560 | loss 3.569761 (-0.95z)| norm 0.3141 (+1.09z)| lr 5.52e-04 | 8447.70 ms | -100.0% bf16 MFU | 62029 tok/s +step 4162/19560 | loss 3.602046 (-0.09z)| norm 0.2984 (+0.50z)| lr 5.52e-04 | 8447.56 ms | -100.0% bf16 MFU | 62031 tok/s +step 4163/19560 | loss 3.555367 (-1.31z)| norm 0.2735 (-0.40z)| lr 5.51e-04 | 8450.96 ms | -100.0% bf16 MFU | 62031 tok/s +step 4164/19560 | loss 3.569455 (-0.94z)| norm 0.2998 (+0.55z)| lr 5.51e-04 | 8449.28 ms | -100.0% bf16 MFU | 62032 tok/s +step 4165/19560 | loss 3.590019 (-0.38z)| norm 0.3330 (+1.73z)| lr 5.51e-04 | 8449.35 ms | -100.0% bf16 MFU | 62033 tok/s +step 4166/19560 | loss 3.592196 (-0.34z)| norm 0.3692 (+2.92z)| lr 5.51e-04 | 8453.63 ms | -100.0% bf16 MFU | 62032 tok/s +step 4167/19560 | loss 3.661763 (+1.51z)| norm 0.4077 (+3.98z)| lr 5.51e-04 | 8447.79 ms | -100.0% bf16 MFU | 62034 tok/s +step 4168/19560 | loss 3.560165 (-1.18z)| norm 0.3409 (+1.74z)| lr 5.51e-04 | 8448.94 ms | -100.0% bf16 MFU | 62035 tok/s +step 4169/19560 | loss 3.576955 (-0.72z)| norm 0.2791 (-0.30z)| lr 5.51e-04 | 8450.98 ms | -100.0% bf16 MFU | 62035 tok/s +step 4170/19560 | loss 3.544798 (-1.59z)| norm 0.3328 (+1.44z)| lr 5.51e-04 | 8445.03 ms | -100.0% bf16 MFU | 62037 tok/s +step 4171/19560 | loss 3.599221 (-0.12z)| norm 0.2838 (-0.17z)| lr 5.51e-04 | 8447.31 ms | -100.0% bf16 MFU | 62039 tok/s +step 4172/19560 | loss 3.574141 (-0.79z)| norm 0.3051 (+0.52z)| lr 5.51e-04 | 8454.58 ms | -100.0% bf16 MFU | 62038 tok/s +step 4173/19560 | loss 3.575869 (-0.73z)| norm 0.3097 (+0.67z)| lr 5.51e-04 | 8451.14 ms | -100.0% bf16 MFU | 62038 tok/s +step 4174/19560 | loss 3.538021 (-1.72z)| norm 0.2673 (-0.71z)| lr 5.51e-04 | 8452.29 ms | -100.0% bf16 MFU | 62037 tok/s +step 4175/19560 | loss 3.628092 (+0.72z)| norm 0.3111 (+0.72z)| lr 5.51e-04 | 8446.34 ms | -100.0% bf16 MFU | 62039 tok/s +step 4176/19560 | loss 3.567695 (-0.91z)| norm 0.2711 (-0.59z)| lr 5.51e-04 | 8456.96 ms | -100.0% bf16 MFU | 62037 tok/s +step 4177/19560 | loss 3.683033 (+2.16z)| norm 0.2930 (+0.13z)| lr 5.51e-04 | 8453.08 ms | -100.0% bf16 MFU | 62036 tok/s +step 4178/19560 | loss 3.655503 (+1.41z)| norm 0.3104 (+0.69z)| lr 5.51e-04 | 8442.98 ms | -100.0% bf16 MFU | 62039 tok/s +step 4179/19560 | loss 3.632169 (+0.77z)| norm 0.3230 (+1.09z)| lr 5.51e-04 | 8442.79 ms | -100.0% bf16 MFU | 62042 tok/s +step 4180/19560 | loss 3.566781 (-0.97z)| norm 0.3009 (+0.36z)| lr 5.51e-04 | 8448.59 ms | -100.0% bf16 MFU | 62043 tok/s +step 4181/19560 | loss 3.590787 (-0.33z)| norm 0.2828 (-0.23z)| lr 5.51e-04 | 8441.27 ms | -100.0% bf16 MFU | 62046 tok/s +step 4182/19560 | loss 3.562583 (-1.06z)| norm 0.2593 (-0.99z)| lr 5.51e-04 | 8445.38 ms | -100.0% bf16 MFU | 62048 tok/s +step 4183/19560 | loss 3.586895 (-0.41z)| norm 0.2619 (-0.89z)| lr 5.51e-04 | 8450.50 ms | -100.0% bf16 MFU | 62048 tok/s +step 4184/19560 | loss 3.619768 (+0.46z)| norm 0.2662 (-0.73z)| lr 5.51e-04 | 8448.62 ms | -100.0% bf16 MFU | 62048 tok/s +step 4185/19560 | loss 3.623427 (+0.56z)| norm 0.2674 (-0.68z)| lr 5.51e-04 | 8454.28 ms | -100.0% bf16 MFU | 62046 tok/s +step 4186/19560 | loss 3.586296 (-0.42z)| norm 0.2611 (-0.88z)| lr 5.51e-04 | 8444.05 ms | -100.0% bf16 MFU | 62048 tok/s +step 4187/19560 | loss 3.600981 (-0.03z)| norm 0.2366 (-1.66z)| lr 5.51e-04 | 8442.41 ms | -100.0% bf16 MFU | 62051 tok/s +step 4188/19560 | loss 3.507996 (-2.47z)| norm 0.2627 (-0.80z)| lr 5.51e-04 | 8444.07 ms | -100.0% bf16 MFU | 62053 tok/s +step 4189/19560 | loss 3.583297 (-0.46z)| norm 0.2406 (-1.50z)| lr 5.51e-04 | 8439.42 ms | -100.0% bf16 MFU | 62057 tok/s +step 4190/19560 | loss 3.584716 (-0.41z)| norm 0.2655 (-0.69z)| lr 5.51e-04 | 8438.25 ms | -100.0% bf16 MFU | 62060 tok/s +step 4191/19560 | loss 3.621247 (+0.59z)| norm 0.2577 (-0.94z)| lr 5.51e-04 | 8448.95 ms | -100.0% bf16 MFU | 62060 tok/s +step 4192/19560 | loss 3.595659 (-0.12z)| norm 0.2433 (-1.40z)| lr 5.51e-04 | 8445.05 ms | -100.0% bf16 MFU | 62061 tok/s +step 4193/19560 | loss 3.537728 (-1.68z)| norm 0.2780 (-0.29z)| lr 5.51e-04 | 8444.89 ms | -100.0% bf16 MFU | 62062 tok/s +step 4194/19560 | loss 3.578358 (-0.56z)| norm 0.2592 (-0.90z)| lr 5.51e-04 | 8440.54 ms | -100.0% bf16 MFU | 62065 tok/s +step 4195/19560 | loss 3.572258 (-0.72z)| norm 0.2557 (-1.01z)| lr 5.51e-04 | 8441.25 ms | -100.0% bf16 MFU | 62067 tok/s +step 4196/19560 | loss 3.582852 (-0.43z)| norm 0.2515 (-1.13z)| lr 5.51e-04 | 8441.58 ms | -100.0% bf16 MFU | 62069 tok/s +step 4197/19560 | loss 3.640968 (+1.16z)| norm 0.2692 (-0.56z)| lr 5.51e-04 | 8447.27 ms | -100.0% bf16 MFU | 62069 tok/s +step 4198/19560 | loss 3.571789 (-0.73z)| norm 0.3056 (+0.61z)| lr 5.51e-04 | 8441.56 ms | -100.0% bf16 MFU | 62071 tok/s +step 4199/19560 | loss 3.518063 (-2.16z)| norm 0.2813 (-0.18z)| lr 5.50e-04 | 8446.15 ms | -100.0% bf16 MFU | 62071 tok/s +step 4200/19560 | loss 3.582948 (-0.39z)| norm 0.2546 (-1.04z)| lr 5.50e-04 | 8444.18 ms | -100.0% bf16 MFU | 62072 tok/s +step 4201/19560 | loss 3.577565 (-0.53z)| norm 0.3314 (+1.42z)| lr 5.50e-04 | 8441.56 ms | -100.0% bf16 MFU | 62074 tok/s +step 4202/19560 | loss 3.624922 (+0.77z)| norm 0.3008 (+0.43z)| lr 5.50e-04 | 8441.43 ms | -100.0% bf16 MFU | 62076 tok/s +step 4203/19560 | loss 3.606502 (+0.26z)| norm 0.2738 (-0.44z)| lr 5.50e-04 | 8442.59 ms | -100.0% bf16 MFU | 62077 tok/s +step 4204/19560 | loss 3.581634 (-0.42z)| norm 0.2572 (-0.96z)| lr 5.50e-04 | 8445.40 ms | -100.0% bf16 MFU | 62077 tok/s +step 4205/19560 | loss 3.528762 (-1.84z)| norm 0.2627 (-0.78z)| lr 5.50e-04 | 8448.32 ms | -100.0% bf16 MFU | 62076 tok/s +step 4206/19560 | loss 3.542151 (-1.45z)| norm 0.2767 (-0.33z)| lr 5.50e-04 | 8440.25 ms | -100.0% bf16 MFU | 62078 tok/s +step 4207/19560 | loss 3.566841 (-0.76z)| norm 0.2554 (-1.00z)| lr 5.50e-04 | 8444.08 ms | -100.0% bf16 MFU | 62079 tok/s +step 4208/19560 | loss 3.566803 (-0.76z)| norm 0.2695 (-0.54z)| lr 5.50e-04 | 8447.14 ms | -100.0% bf16 MFU | 62078 tok/s +step 4209/19560 | loss 3.561061 (-0.91z)| norm 0.2714 (-0.47z)| lr 5.50e-04 | 8455.42 ms | -100.0% bf16 MFU | 62074 tok/s +step 4210/19560 | loss 3.628079 (+0.90z)| norm 0.2577 (-0.91z)| lr 5.50e-04 | 8442.39 ms | -100.0% bf16 MFU | 62076 tok/s +step 4211/19560 | loss 3.581528 (-0.35z)| norm 0.2594 (-0.85z)| lr 5.50e-04 | 8445.96 ms | -100.0% bf16 MFU | 62076 tok/s +step 4212/19560 | loss 3.594402 (+0.01z)| norm 0.2689 (-0.54z)| lr 5.50e-04 | 8441.19 ms | -100.0% bf16 MFU | 62078 tok/s +step 4213/19560 | loss 3.562648 (-0.85z)| norm 0.2468 (-1.24z)| lr 5.50e-04 | 8440.48 ms | -100.0% bf16 MFU | 62079 tok/s +step 4214/19560 | loss 3.589643 (-0.11z)| norm 0.2623 (-0.74z)| lr 5.50e-04 | 8442.66 ms | -100.0% bf16 MFU | 62081 tok/s +step 4215/19560 | loss 3.693389 (+2.72z)| norm 0.2788 (-0.22z)| lr 5.50e-04 | 8444.72 ms | -100.0% bf16 MFU | 62081 tok/s +step 4216/19560 | loss 3.626462 (+0.88z)| norm 0.2950 (+0.29z)| lr 5.50e-04 | 8444.60 ms | -100.0% bf16 MFU | 62081 tok/s +step 4217/19560 | loss 3.546286 (-1.29z)| norm 0.3025 (+0.52z)| lr 5.50e-04 | 8442.47 ms | -100.0% bf16 MFU | 62082 tok/s +step 4218/19560 | loss 3.584332 (-0.25z)| norm 0.2959 (+0.32z)| lr 5.50e-04 | 8444.05 ms | -100.0% bf16 MFU | 62082 tok/s +step 4219/19560 | loss 3.609130 (+0.42z)| norm 0.2684 (-0.54z)| lr 5.50e-04 | 8445.05 ms | -100.0% bf16 MFU | 62082 tok/s +step 4220/19560 | loss 3.574834 (-0.51z)| norm 0.2534 (-1.01z)| lr 5.50e-04 | 8438.78 ms | -100.0% bf16 MFU | 62085 tok/s +step 4221/19560 | loss 3.648409 (+1.48z)| norm 0.4532 (+4.77z)| lr 5.50e-04 | 8442.34 ms | -100.0% bf16 MFU | 62086 tok/s +step 4222/19560 | loss 3.620820 (+0.72z)| norm 0.2889 (+0.05z)| lr 5.50e-04 | 8444.89 ms | -100.0% bf16 MFU | 62085 tok/s +step 4223/19560 | loss 3.617044 (+0.63z)| norm 0.2818 (-0.16z)| lr 5.50e-04 | 8444.04 ms | -100.0% bf16 MFU | 62086 tok/s +step 4224/19560 | loss 3.567760 (-0.69z)| norm 0.2825 (-0.13z)| lr 5.50e-04 | 8447.18 ms | -100.0% bf16 MFU | 62085 tok/s +step 4225/19560 | loss 3.594743 (+0.04z)| norm 0.2828 (-0.13z)| lr 5.50e-04 | 8446.04 ms | -100.0% bf16 MFU | 62084 tok/s +step 4226/19560 | loss 3.572589 (-0.55z)| norm 0.2727 (-0.42z)| lr 5.50e-04 | 8443.45 ms | -100.0% bf16 MFU | 62085 tok/s +step 4227/19560 | loss 3.614824 (+0.59z)| norm 0.2776 (-0.27z)| lr 5.50e-04 | 8443.74 ms | -100.0% bf16 MFU | 62085 tok/s +step 4228/19560 | loss 3.522305 (-1.90z)| norm 0.3038 (+0.48z)| lr 5.50e-04 | 8443.60 ms | -100.0% bf16 MFU | 62085 tok/s +step 4229/19560 | loss 3.584205 (-0.23z)| norm 0.3163 (+0.85z)| lr 5.50e-04 | 8446.03 ms | -100.0% bf16 MFU | 62085 tok/s +step 4230/19560 | loss 3.625217 (+0.86z)| norm 0.3488 (+1.78z)| lr 5.50e-04 | 8445.78 ms | -100.0% bf16 MFU | 62085 tok/s +step 4231/19560 | loss 3.731225 (+3.50z)| norm 0.3742 (+2.45z)| lr 5.50e-04 | 8437.98 ms | -100.0% bf16 MFU | 62087 tok/s +step 4232/19560 | loss 3.536791 (-1.44z)| norm 0.3467 (+1.64z)| lr 5.50e-04 | 8441.20 ms | -100.0% bf16 MFU | 62088 tok/s +step 4233/19560 | loss 3.603195 (+0.26z)| norm 0.3269 (+1.08z)| lr 5.50e-04 | 8443.91 ms | -100.0% bf16 MFU | 62088 tok/s +step 4234/19560 | loss 3.576685 (-0.41z)| norm 0.3060 (+0.49z)| lr 5.50e-04 | 8443.58 ms | -100.0% bf16 MFU | 62089 tok/s +step 4235/19560 | loss 3.572721 (-0.51z)| norm 0.3048 (+0.45z)| lr 5.50e-04 | 8439.32 ms | -100.0% bf16 MFU | 62090 tok/s +step 4236/19560 | loss 3.580502 (-0.31z)| norm 0.3058 (+0.47z)| lr 5.49e-04 | 8443.09 ms | -100.0% bf16 MFU | 62091 tok/s +step 4237/19560 | loss 3.645154 (+1.34z)| norm 0.2873 (-0.06z)| lr 5.49e-04 | 8443.13 ms | -100.0% bf16 MFU | 62091 tok/s +step 4238/19560 | loss 3.701071 (+2.68z)| norm 0.3153 (+0.72z)| lr 5.49e-04 | 8444.44 ms | -100.0% bf16 MFU | 62091 tok/s +step 4239/19560 | loss 3.599428 (+0.14z)| norm 0.2734 (-0.46z)| lr 5.49e-04 | 8439.74 ms | -100.0% bf16 MFU | 62092 tok/s +step 4240/19560 | loss 3.547419 (-1.14z)| norm 0.2511 (-1.07z)| lr 5.49e-04 | 8444.68 ms | -100.0% bf16 MFU | 62092 tok/s +step 4241/19560 | loss 3.576882 (-0.40z)| norm 0.2946 (+0.16z)| lr 5.49e-04 | 8443.89 ms | -100.0% bf16 MFU | 62092 tok/s +step 4242/19560 | loss 3.564625 (-0.72z)| norm 0.2937 (+0.16z)| lr 5.49e-04 | 8444.11 ms | -100.0% bf16 MFU | 62092 tok/s +step 4243/19560 | loss 3.601546 (+0.24z)| norm 0.2833 (-0.14z)| lr 5.49e-04 | 8441.37 ms | -100.0% bf16 MFU | 62093 tok/s +step 4244/19560 | loss 3.682343 (+2.28z)| norm 0.2580 (-0.87z)| lr 5.49e-04 | 8442.43 ms | -100.0% bf16 MFU | 62093 tok/s +step 4245/19560 | loss 3.629322 (+0.92z)| norm 0.2727 (-0.44z)| lr 5.49e-04 | 8442.92 ms | -100.0% bf16 MFU | 62093 tok/s +step 4246/19560 | loss 3.602520 (+0.26z)| norm 0.2901 (+0.08z)| lr 5.49e-04 | 8441.37 ms | -100.0% bf16 MFU | 62094 tok/s +step 4247/19560 | loss 3.580882 (-0.31z)| norm 0.2559 (-0.91z)| lr 5.49e-04 | 8442.89 ms | -100.0% bf16 MFU | 62094 tok/s +step 4248/19560 | loss 3.588467 (-0.11z)| norm 0.2483 (-1.12z)| lr 5.49e-04 | 8439.95 ms | -100.0% bf16 MFU | 62096 tok/s +step 4249/19560 | loss 3.617106 (+0.63z)| norm 0.2532 (-0.96z)| lr 5.49e-04 | 8442.99 ms | -100.0% bf16 MFU | 62096 tok/s +step 4250/19560 | loss 3.525718 (-1.75z)| norm 0.2631 (-0.67z)| lr 5.49e-04 | 8442.81 ms | -100.0% bf16 MFU | 62096 tok/s +val loss 3.591483 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2722/10042 = 0.271062 +step 4251/19560 | loss 3.484129 (-2.74z)| norm 0.2577 (-0.82z)| lr 5.49e-04 | 8443.87 ms | -100.0% bf16 MFU | 62096 tok/s +step 4252/19560 | loss 3.612324 (+0.51z)| norm 0.2639 (-0.64z)| lr 5.49e-04 | 8441.17 ms | -100.0% bf16 MFU | 62096 tok/s +step 4253/19560 | loss 3.553192 (-0.99z)| norm 0.2664 (-0.57z)| lr 5.49e-04 | 8442.40 ms | -100.0% bf16 MFU | 62097 tok/s +step 4254/19560 | loss 3.555277 (-0.92z)| norm 0.2654 (-0.59z)| lr 5.49e-04 | 8440.22 ms | -100.0% bf16 MFU | 62098 tok/s +step 4255/19560 | loss 3.613074 (+0.52z)| norm 0.2868 (+0.05z)| lr 5.49e-04 | 8441.51 ms | -100.0% bf16 MFU | 62098 tok/s +step 4256/19560 | loss 3.559362 (-0.84z)| norm 0.2672 (-0.53z)| lr 5.49e-04 | 8444.87 ms | -100.0% bf16 MFU | 62097 tok/s +step 4257/19560 | loss 3.611836 (+0.49z)| norm 0.2526 (-0.94z)| lr 5.49e-04 | 8443.38 ms | -100.0% bf16 MFU | 62097 tok/s +step 4258/19560 | loss 3.601935 (+0.23z)| norm 0.2593 (-0.74z)| lr 5.49e-04 | 8440.07 ms | -100.0% bf16 MFU | 62098 tok/s +step 4259/19560 | loss 3.672272 (+2.01z)| norm 0.2462 (-1.12z)| lr 5.49e-04 | 8442.48 ms | -100.0% bf16 MFU | 62099 tok/s +step 4260/19560 | loss 3.610915 (+0.46z)| norm 0.2490 (-1.02z)| lr 5.49e-04 | 8441.48 ms | -100.0% bf16 MFU | 62099 tok/s +step 4261/19560 | loss 3.655921 (+1.58z)| norm 0.2882 (+0.12z)| lr 5.49e-04 | 8445.55 ms | -100.0% bf16 MFU | 62098 tok/s +step 4262/19560 | loss 3.576868 (-0.41z)| norm 0.2851 (+0.02z)| lr 5.49e-04 | 8440.93 ms | -100.0% bf16 MFU | 62099 tok/s +step 4263/19560 | loss 3.621640 (+0.71z)| norm 0.2879 (+0.10z)| lr 5.49e-04 | 8442.01 ms | -100.0% bf16 MFU | 62099 tok/s +step 4264/19560 | loss 3.589619 (-0.10z)| norm 0.2701 (-0.42z)| lr 5.49e-04 | 8440.13 ms | -100.0% bf16 MFU | 62100 tok/s +step 4265/19560 | loss 3.591009 (-0.06z)| norm 0.2992 (+0.43z)| lr 5.49e-04 | 8443.99 ms | -100.0% bf16 MFU | 62100 tok/s +step 4266/19560 | loss 3.625339 (+0.82z)| norm 0.3123 (+0.80z)| lr 5.49e-04 | 8442.12 ms | -100.0% bf16 MFU | 62100 tok/s +step 4267/19560 | loss 3.602189 (+0.22z)| norm 0.3102 (+0.72z)| lr 5.49e-04 | 8439.57 ms | -100.0% bf16 MFU | 62101 tok/s +step 4268/19560 | loss 3.577858 (-0.41z)| norm 0.2727 (-0.37z)| lr 5.49e-04 | 8442.01 ms | -100.0% bf16 MFU | 62101 tok/s +step 4269/19560 | loss 3.606958 (+0.34z)| norm 0.2682 (-0.50z)| lr 5.49e-04 | 8440.53 ms | -100.0% bf16 MFU | 62102 tok/s +step 4270/19560 | loss 3.578277 (-0.40z)| norm 0.2707 (-0.42z)| lr 5.49e-04 | 8442.59 ms | -100.0% bf16 MFU | 62102 tok/s +step 4271/19560 | loss 3.654315 (+1.54z)| norm 0.2849 (-0.01z)| lr 5.48e-04 | 8440.60 ms | -100.0% bf16 MFU | 62102 tok/s +step 4272/19560 | loss 3.559367 (-0.88z)| norm 0.2528 (-0.93z)| lr 5.48e-04 | 8443.56 ms | -100.0% bf16 MFU | 62102 tok/s +step 4273/19560 | loss 3.589754 (-0.10z)| norm 0.2847 (+0.00z)| lr 5.48e-04 | 8442.14 ms | -100.0% bf16 MFU | 62102 tok/s +step 4274/19560 | loss 3.611279 (+0.44z)| norm 0.2945 (+0.28z)| lr 5.48e-04 | 8440.05 ms | -100.0% bf16 MFU | 62103 tok/s +step 4275/19560 | loss 3.591514 (-0.06z)| norm 0.2616 (-0.68z)| lr 5.48e-04 | 8441.05 ms | -100.0% bf16 MFU | 62103 tok/s +step 4276/19560 | loss 3.601090 (+0.18z)| norm 0.2523 (-0.95z)| lr 5.48e-04 | 8439.49 ms | -100.0% bf16 MFU | 62104 tok/s +step 4277/19560 | loss 3.615571 (+0.55z)| norm 0.3780 (+2.62z)| lr 5.48e-04 | 8435.88 ms | -100.0% bf16 MFU | 62107 tok/s +step 4278/19560 | loss 3.592466 (-0.04z)| norm 0.2887 (+0.08z)| lr 5.48e-04 | 8440.16 ms | -100.0% bf16 MFU | 62107 tok/s +step 4279/19560 | loss 3.626690 (+0.84z)| norm 0.2805 (-0.16z)| lr 5.48e-04 | 8441.08 ms | -100.0% bf16 MFU | 62107 tok/s +step 4280/19560 | loss 3.587301 (-0.18z)| norm 0.3015 (+0.43z)| lr 5.48e-04 | 8439.09 ms | -100.0% bf16 MFU | 62108 tok/s +step 4281/19560 | loss 3.633065 (+0.99z)| norm 0.3072 (+0.58z)| lr 5.48e-04 | 8441.40 ms | -100.0% bf16 MFU | 62108 tok/s +step 4282/19560 | loss 3.559236 (-0.90z)| norm 0.2824 (-0.14z)| lr 5.48e-04 | 8437.41 ms | -100.0% bf16 MFU | 62110 tok/s +step 4283/19560 | loss 3.600719 (+0.16z)| norm 0.2894 (+0.07z)| lr 5.48e-04 | 8436.08 ms | -100.0% bf16 MFU | 62112 tok/s +step 4284/19560 | loss 3.648265 (+1.37z)| norm 0.3378 (+1.45z)| lr 5.48e-04 | 8434.89 ms | -100.0% bf16 MFU | 62114 tok/s +step 4285/19560 | loss 3.563030 (-0.80z)| norm 0.2734 (-0.40z)| lr 5.48e-04 | 8436.21 ms | -100.0% bf16 MFU | 62116 tok/s +step 4286/19560 | loss 3.582129 (-0.30z)| norm 0.2598 (-0.79z)| lr 5.48e-04 | 8431.54 ms | -100.0% bf16 MFU | 62119 tok/s +step 4287/19560 | loss 3.460620 (-3.24z)| norm 0.2918 (+0.18z)| lr 5.48e-04 | 8431.83 ms | -100.0% bf16 MFU | 62122 tok/s +step 4288/19560 | loss 3.606969 (+0.35z)| norm 0.2766 (-0.27z)| lr 5.48e-04 | 8430.12 ms | -100.0% bf16 MFU | 62126 tok/s +step 4289/19560 | loss 3.604177 (+0.28z)| norm 0.2900 (+0.14z)| lr 5.48e-04 | 8431.75 ms | -100.0% bf16 MFU | 62128 tok/s +step 4290/19560 | loss 3.616070 (+0.57z)| norm 0.2831 (-0.07z)| lr 5.48e-04 | 8434.59 ms | -100.0% bf16 MFU | 62130 tok/s +step 4291/19560 | loss 3.582333 (-0.27z)| norm 0.2512 (-1.02z)| lr 5.48e-04 | 8432.12 ms | -100.0% bf16 MFU | 62132 tok/s +step 4292/19560 | loss 3.625958 (+0.80z)| norm 0.2914 (+0.19z)| lr 5.48e-04 | 8433.19 ms | -100.0% bf16 MFU | 62134 tok/s +step 4293/19560 | loss 3.608929 (+0.37z)| norm 0.2701 (-0.44z)| lr 5.48e-04 | 8433.03 ms | -100.0% bf16 MFU | 62136 tok/s +step 4294/19560 | loss 3.557077 (-0.90z)| norm 0.2772 (-0.21z)| lr 5.48e-04 | 8434.34 ms | -100.0% bf16 MFU | 62137 tok/s +step 4295/19560 | loss 3.566232 (-0.66z)| norm 0.2471 (-1.17z)| lr 5.48e-04 | 8433.44 ms | -100.0% bf16 MFU | 62139 tok/s +step 4296/19560 | loss 3.596695 (+0.09z)| norm 0.2731 (-0.30z)| lr 5.48e-04 | 8435.68 ms | -100.0% bf16 MFU | 62139 tok/s +step 4297/19560 | loss 3.579677 (-0.33z)| norm 0.3000 (+0.59z)| lr 5.48e-04 | 8434.24 ms | -100.0% bf16 MFU | 62140 tok/s +step 4298/19560 | loss 3.630697 (+0.92z)| norm 0.3244 (+1.41z)| lr 5.48e-04 | 8435.43 ms | -100.0% bf16 MFU | 62141 tok/s +step 4299/19560 | loss 3.594978 (+0.03z)| norm 0.3263 (+1.45z)| lr 5.48e-04 | 8434.12 ms | -100.0% bf16 MFU | 62142 tok/s +step 4300/19560 | loss 3.579055 (-0.37z)| norm 0.3024 (+0.66z)| lr 5.48e-04 | 8433.43 ms | -100.0% bf16 MFU | 62143 tok/s +step 4301/19560 | loss 3.607775 (+0.34z)| norm 0.3148 (+1.07z)| lr 5.48e-04 | 8449.09 ms | -100.0% bf16 MFU | 62139 tok/s +step 4302/19560 | loss 3.641337 (+1.17z)| norm 0.2842 (+0.05z)| lr 5.48e-04 | 8463.98 ms | -100.0% bf16 MFU | 62129 tok/s +step 4303/19560 | loss 3.649574 (+1.36z)| norm 0.2834 (+0.03z)| lr 5.48e-04 | 8463.74 ms | -100.0% bf16 MFU | 62120 tok/s +step 4304/19560 | loss 3.540005 (-1.36z)| norm 0.2955 (+0.43z)| lr 5.48e-04 | 8466.37 ms | -100.0% bf16 MFU | 62110 tok/s +step 4305/19560 | loss 3.650760 (+1.41z)| norm 0.2825 (-0.00z)| lr 5.48e-04 | 8464.07 ms | -100.0% bf16 MFU | 62102 tok/s +step 4306/19560 | loss 3.575146 (-0.47z)| norm 0.2631 (-0.64z)| lr 5.48e-04 | 8463.09 ms | -100.0% bf16 MFU | 62094 tok/s +step 4307/19560 | loss 3.588426 (-0.13z)| norm 0.2582 (-0.79z)| lr 5.47e-04 | 8462.73 ms | -100.0% bf16 MFU | 62087 tok/s +step 4308/19560 | loss 3.535529 (-1.46z)| norm 0.2891 (+0.25z)| lr 5.47e-04 | 8460.79 ms | -100.0% bf16 MFU | 62081 tok/s +step 4309/19560 | loss 3.621309 (+0.70z)| norm 0.2555 (-0.87z)| lr 5.47e-04 | 8459.60 ms | -100.0% bf16 MFU | 62076 tok/s +step 4310/19560 | loss 3.553384 (-1.01z)| norm 0.2881 (+0.22z)| lr 5.47e-04 | 8460.74 ms | -100.0% bf16 MFU | 62070 tok/s +step 4311/19560 | loss 3.548760 (-1.11z)| norm 0.2576 (-0.80z)| lr 5.47e-04 | 8461.43 ms | -100.0% bf16 MFU | 62065 tok/s +step 4312/19560 | loss 3.589974 (-0.07z)| norm 0.2472 (-1.14z)| lr 5.47e-04 | 8460.89 ms | -100.0% bf16 MFU | 62060 tok/s +step 4313/19560 | loss 3.529896 (-1.55z)| norm 0.2560 (-0.84z)| lr 5.47e-04 | 8464.29 ms | -100.0% bf16 MFU | 62054 tok/s +step 4314/19560 | loss 3.574332 (-0.44z)| norm 0.2489 (-1.07z)| lr 5.47e-04 | 8459.07 ms | -100.0% bf16 MFU | 62050 tok/s +step 4315/19560 | loss 3.581999 (-0.25z)| norm 0.2816 (+0.00z)| lr 5.47e-04 | 8466.71 ms | -100.0% bf16 MFU | 62044 tok/s +step 4316/19560 | loss 3.604270 (+0.29z)| norm 0.2663 (-0.51z)| lr 5.47e-04 | 8464.15 ms | -100.0% bf16 MFU | 62039 tok/s +step 4317/19560 | loss 3.592020 (-0.02z)| norm 0.2474 (-1.15z)| lr 5.47e-04 | 8456.29 ms | -100.0% bf16 MFU | 62037 tok/s +step 4318/19560 | loss 3.634649 (+1.04z)| norm 0.2839 (+0.07z)| lr 5.47e-04 | 8456.25 ms | -100.0% bf16 MFU | 62035 tok/s +step 4319/19560 | loss 3.651388 (+1.45z)| norm 0.3080 (+0.87z)| lr 5.47e-04 | 8460.03 ms | -100.0% bf16 MFU | 62032 tok/s +step 4320/19560 | loss 3.625137 (+0.79z)| norm 0.2781 (-0.15z)| lr 5.47e-04 | 8460.71 ms | -100.0% bf16 MFU | 62029 tok/s +step 4321/19560 | loss 3.576477 (-0.44z)| norm 0.2567 (-0.87z)| lr 5.47e-04 | 8455.42 ms | -100.0% bf16 MFU | 62028 tok/s +step 4322/19560 | loss 3.622499 (+0.71z)| norm 0.2721 (-0.35z)| lr 5.47e-04 | 8458.21 ms | -100.0% bf16 MFU | 62026 tok/s +step 4323/19560 | loss 3.593862 (-0.01z)| norm 0.2629 (-0.67z)| lr 5.47e-04 | 8455.66 ms | -100.0% bf16 MFU | 62024 tok/s +step 4324/19560 | loss 3.607379 (+0.32z)| norm 0.2864 (+0.12z)| lr 5.47e-04 | 8454.45 ms | -100.0% bf16 MFU | 62024 tok/s +step 4325/19560 | loss 3.577335 (-0.42z)| norm 0.2972 (+0.48z)| lr 5.47e-04 | 8458.95 ms | -100.0% bf16 MFU | 62022 tok/s +step 4326/19560 | loss 3.579025 (-0.38z)| norm 0.2721 (-0.36z)| lr 5.47e-04 | 8455.02 ms | -100.0% bf16 MFU | 62021 tok/s +step 4327/19560 | loss 3.607219 (+0.32z)| norm 0.2694 (-0.45z)| lr 5.47e-04 | 8455.23 ms | -100.0% bf16 MFU | 62020 tok/s +step 4328/19560 | loss 3.624319 (+0.74z)| norm 0.2743 (-0.29z)| lr 5.47e-04 | 8451.95 ms | -100.0% bf16 MFU | 62021 tok/s +step 4329/19560 | loss 3.562840 (-0.82z)| norm 0.2734 (-0.31z)| lr 5.47e-04 | 8455.02 ms | -100.0% bf16 MFU | 62020 tok/s +step 4330/19560 | loss 3.617047 (+0.56z)| norm 0.2870 (+0.16z)| lr 5.47e-04 | 8458.51 ms | -100.0% bf16 MFU | 62019 tok/s +step 4331/19560 | loss 3.564466 (-0.77z)| norm 0.2772 (-0.17z)| lr 5.47e-04 | 8456.18 ms | -100.0% bf16 MFU | 62018 tok/s +step 4332/19560 | loss 3.578405 (-0.42z)| norm 0.2508 (-1.09z)| lr 5.47e-04 | 8449.54 ms | -100.0% bf16 MFU | 62019 tok/s +step 4333/19560 | loss 3.678524 (+2.09z)| norm 0.2759 (-0.22z)| lr 5.47e-04 | 8455.34 ms | -100.0% bf16 MFU | 62019 tok/s +step 4334/19560 | loss 3.654003 (+1.45z)| norm 0.3000 (+0.61z)| lr 5.47e-04 | 8452.60 ms | -100.0% bf16 MFU | 62019 tok/s +step 4335/19560 | loss 3.603771 (+0.17z)| norm 0.2934 (+0.37z)| lr 5.47e-04 | 8449.34 ms | -100.0% bf16 MFU | 62021 tok/s +step 4336/19560 | loss 3.572320 (-0.63z)| norm 0.2951 (+0.42z)| lr 5.47e-04 | 8454.18 ms | -100.0% bf16 MFU | 62020 tok/s +step 4337/19560 | loss 3.577159 (-0.51z)| norm 0.3235 (+1.38z)| lr 5.47e-04 | 8452.09 ms | -100.0% bf16 MFU | 62021 tok/s +step 4338/19560 | loss 3.591096 (-0.15z)| norm 0.3113 (+0.95z)| lr 5.47e-04 | 8456.94 ms | -100.0% bf16 MFU | 62020 tok/s +step 4339/19560 | loss 3.565263 (-0.80z)| norm 0.2974 (+0.46z)| lr 5.47e-04 | 8453.98 ms | -100.0% bf16 MFU | 62019 tok/s +step 4340/19560 | loss 3.611097 (+0.36z)| norm 0.2592 (-0.86z)| lr 5.47e-04 | 8454.17 ms | -100.0% bf16 MFU | 62019 tok/s +step 4341/19560 | loss 3.679929 (+2.06z)| norm 0.2928 (+0.29z)| lr 5.47e-04 | 8457.34 ms | -100.0% bf16 MFU | 62018 tok/s +step 4342/19560 | loss 3.583458 (-0.36z)| norm 0.2641 (-0.70z)| lr 5.46e-04 | 8450.48 ms | -100.0% bf16 MFU | 62019 tok/s +step 4343/19560 | loss 3.635756 (+0.98z)| norm 0.3215 (+1.27z)| lr 5.46e-04 | 8449.36 ms | -100.0% bf16 MFU | 62021 tok/s +step 4344/19560 | loss 3.623384 (+0.67z)| norm 0.3131 (+0.97z)| lr 5.46e-04 | 8456.78 ms | -100.0% bf16 MFU | 62019 tok/s +step 4345/19560 | loss 3.612602 (+0.38z)| norm 0.2887 (+0.14z)| lr 5.46e-04 | 8457.29 ms | -100.0% bf16 MFU | 62018 tok/s +step 4346/19560 | loss 3.541831 (-1.42z)| norm 0.2801 (-0.16z)| lr 5.46e-04 | 8452.00 ms | -100.0% bf16 MFU | 62019 tok/s +step 4347/19560 | loss 3.590628 (-0.17z)| norm 0.2988 (+0.48z)| lr 5.46e-04 | 8452.76 ms | -100.0% bf16 MFU | 62019 tok/s +step 4348/19560 | loss 3.595985 (-0.04z)| norm 0.2703 (-0.51z)| lr 5.46e-04 | 8453.36 ms | -100.0% bf16 MFU | 62019 tok/s +step 4349/19560 | loss 3.561743 (-0.90z)| norm 0.2747 (-0.36z)| lr 5.46e-04 | 8448.65 ms | -100.0% bf16 MFU | 62021 tok/s +step 4350/19560 | loss 3.685884 (+2.23z)| norm 0.3493 (+2.56z)| lr 5.46e-04 | 8458.72 ms | -100.0% bf16 MFU | 62019 tok/s +step 4351/19560 | loss 3.712000 (+2.78z)| norm 0.3322 (+1.85z)| lr 5.46e-04 | 8448.41 ms | -100.0% bf16 MFU | 62021 tok/s +step 4352/19560 | loss 3.565368 (-0.80z)| norm 0.2875 (+0.12z)| lr 5.46e-04 | 8449.65 ms | -100.0% bf16 MFU | 62022 tok/s +step 4353/19560 | loss 3.497364 (-2.39z)| norm 0.2865 (+0.08z)| lr 5.46e-04 | 8447.31 ms | -100.0% bf16 MFU | 62025 tok/s +step 4354/19560 | loss 3.670385 (+1.71z)| norm 0.3483 (+2.39z)| lr 5.46e-04 | 8449.21 ms | -100.0% bf16 MFU | 62026 tok/s +step 4355/19560 | loss 3.613359 (+0.36z)| norm 0.3426 (+2.12z)| lr 5.46e-04 | 8453.35 ms | -100.0% bf16 MFU | 62026 tok/s +step 4356/19560 | loss 3.535923 (-1.48z)| norm 0.3001 (+0.54z)| lr 5.46e-04 | 8450.03 ms | -100.0% bf16 MFU | 62027 tok/s +step 4357/19560 | loss 3.702843 (+2.41z)| norm 0.2704 (-0.55z)| lr 5.46e-04 | 8445.56 ms | -100.0% bf16 MFU | 62029 tok/s +step 4358/19560 | loss 3.570746 (-0.65z)| norm 0.3170 (+1.22z)| lr 5.46e-04 | 8450.39 ms | -100.0% bf16 MFU | 62030 tok/s +step 4359/19560 | loss 3.638183 (+0.97z)| norm 0.5146 (+7.08z)| lr 5.46e-04 | 8452.36 ms | -100.0% bf16 MFU | 62030 tok/s +step 4360/19560 | loss 3.634567 (+0.87z)| norm 0.3355 (+1.54z)| lr 5.46e-04 | 8448.52 ms | -100.0% bf16 MFU | 62031 tok/s +step 4361/19560 | loss 3.657066 (+1.39z)| norm 0.2997 (+0.43z)| lr 5.46e-04 | 8451.01 ms | -100.0% bf16 MFU | 62032 tok/s +step 4362/19560 | loss 3.603516 (+0.10z)| norm 0.3216 (+1.11z)| lr 5.46e-04 | 8453.47 ms | -100.0% bf16 MFU | 62031 tok/s +step 4363/19560 | loss 3.581468 (-0.43z)| norm 0.3045 (+0.58z)| lr 5.46e-04 | 8443.55 ms | -100.0% bf16 MFU | 62034 tok/s +step 4364/19560 | loss 3.581555 (-0.43z)| norm 0.3034 (+0.55z)| lr 5.46e-04 | 8450.41 ms | -100.0% bf16 MFU | 62035 tok/s +step 4365/19560 | loss 3.601308 (+0.05z)| norm 0.3056 (+0.61z)| lr 5.46e-04 | 8451.68 ms | -100.0% bf16 MFU | 62035 tok/s +step 4366/19560 | loss 3.565383 (-0.80z)| norm 0.2960 (+0.32z)| lr 5.46e-04 | 8448.97 ms | -100.0% bf16 MFU | 62035 tok/s +step 4367/19560 | loss 3.622285 (+0.59z)| norm 0.3023 (+0.51z)| lr 5.46e-04 | 8448.91 ms | -100.0% bf16 MFU | 62036 tok/s +step 4368/19560 | loss 3.584032 (-0.36z)| norm 0.2893 (+0.09z)| lr 5.46e-04 | 8444.62 ms | -100.0% bf16 MFU | 62039 tok/s +step 4369/19560 | loss 3.565120 (-0.82z)| norm 0.2566 (-0.92z)| lr 5.46e-04 | 8444.22 ms | -100.0% bf16 MFU | 62041 tok/s +step 4370/19560 | loss 3.633619 (+0.86z)| norm 0.2970 (+0.34z)| lr 5.46e-04 | 8444.28 ms | -100.0% bf16 MFU | 62044 tok/s +step 4371/19560 | loss 3.553183 (-1.12z)| norm 0.2854 (-0.02z)| lr 5.46e-04 | 8448.24 ms | -100.0% bf16 MFU | 62044 tok/s +step 4372/19560 | loss 3.619107 (+0.53z)| norm 0.2602 (-0.81z)| lr 5.46e-04 | 8444.13 ms | -100.0% bf16 MFU | 62047 tok/s +step 4373/19560 | loss 3.664788 (+1.65z)| norm 0.2969 (+0.33z)| lr 5.46e-04 | 8454.06 ms | -100.0% bf16 MFU | 62045 tok/s +step 4374/19560 | loss 3.576939 (-0.52z)| norm 0.2907 (+0.13z)| lr 5.46e-04 | 8445.98 ms | -100.0% bf16 MFU | 62047 tok/s +step 4375/19560 | loss 3.568810 (-0.72z)| norm 0.2710 (-0.49z)| lr 5.46e-04 | 8445.29 ms | -100.0% bf16 MFU | 62048 tok/s +step 4376/19560 | loss 3.593096 (-0.12z)| norm 0.2831 (-0.12z)| lr 5.46e-04 | 8447.11 ms | -100.0% bf16 MFU | 62049 tok/s +step 4377/19560 | loss 3.496666 (-2.43z)| norm 0.2649 (-0.69z)| lr 5.45e-04 | 8449.36 ms | -100.0% bf16 MFU | 62049 tok/s +step 4378/19560 | loss 3.647656 (+1.21z)| norm 0.3118 (+0.78z)| lr 5.45e-04 | 8443.86 ms | -100.0% bf16 MFU | 62051 tok/s +step 4379/19560 | loss 3.579226 (-0.49z)| norm 0.2936 (+0.19z)| lr 5.45e-04 | 8447.58 ms | -100.0% bf16 MFU | 62052 tok/s +step 4380/19560 | loss 3.609481 (+0.27z)| norm 0.2454 (-1.32z)| lr 5.45e-04 | 8446.50 ms | -100.0% bf16 MFU | 62053 tok/s +step 4381/19560 | loss 3.688569 (+2.19z)| norm 0.2714 (-0.50z)| lr 5.45e-04 | 8442.05 ms | -100.0% bf16 MFU | 62056 tok/s +step 4382/19560 | loss 3.555653 (-1.09z)| norm 0.2673 (-0.64z)| lr 5.45e-04 | 8445.02 ms | -100.0% bf16 MFU | 62057 tok/s +step 4383/19560 | loss 3.625415 (+0.63z)| norm 0.2633 (-0.76z)| lr 5.45e-04 | 8446.81 ms | -100.0% bf16 MFU | 62058 tok/s +step 4384/19560 | loss 3.538980 (-1.49z)| norm 0.2658 (-0.68z)| lr 5.45e-04 | 8448.74 ms | -100.0% bf16 MFU | 62057 tok/s +step 4385/19560 | loss 3.599353 (-0.01z)| norm 0.2662 (-0.67z)| lr 5.45e-04 | 8446.67 ms | -100.0% bf16 MFU | 62058 tok/s +step 4386/19560 | loss 3.630637 (+0.75z)| norm 0.2672 (-0.64z)| lr 5.45e-04 | 8449.56 ms | -100.0% bf16 MFU | 62058 tok/s +step 4387/19560 | loss 3.590365 (-0.22z)| norm 0.2728 (-0.47z)| lr 5.45e-04 | 8442.28 ms | -100.0% bf16 MFU | 62060 tok/s +step 4388/19560 | loss 3.606236 (+0.17z)| norm 0.2977 (+0.31z)| lr 5.45e-04 | 8444.49 ms | -100.0% bf16 MFU | 62061 tok/s +step 4389/19560 | loss 3.605326 (+0.16z)| norm 0.3123 (+0.77z)| lr 5.45e-04 | 8447.05 ms | -100.0% bf16 MFU | 62062 tok/s +step 4390/19560 | loss 3.713569 (+2.76z)| norm 0.3105 (+0.71z)| lr 5.45e-04 | 8445.11 ms | -100.0% bf16 MFU | 62063 tok/s +step 4391/19560 | loss 3.609596 (+0.24z)| norm 0.2596 (-0.91z)| lr 5.45e-04 | 8448.47 ms | -100.0% bf16 MFU | 62062 tok/s +step 4392/19560 | loss 3.665984 (+1.58z)| norm 0.2696 (-0.59z)| lr 5.45e-04 | 8447.71 ms | -100.0% bf16 MFU | 62062 tok/s +step 4393/19560 | loss 3.525174 (-1.78z)| norm 0.2691 (-0.60z)| lr 5.45e-04 | 8445.75 ms | -100.0% bf16 MFU | 62063 tok/s +step 4394/19560 | loss 3.717759 (+2.71z)| norm 0.2916 (+0.12z)| lr 5.45e-04 | 8442.43 ms | -100.0% bf16 MFU | 62065 tok/s +step 4395/19560 | loss 3.575319 (-0.58z)| norm 0.2655 (-0.70z)| lr 5.45e-04 | 8446.58 ms | -100.0% bf16 MFU | 62065 tok/s +step 4396/19560 | loss 3.591433 (-0.21z)| norm 0.2349 (-1.65z)| lr 5.45e-04 | 8447.85 ms | -100.0% bf16 MFU | 62065 tok/s +step 4397/19560 | loss 3.588270 (-0.28z)| norm 0.2442 (-1.34z)| lr 5.45e-04 | 8441.88 ms | -100.0% bf16 MFU | 62067 tok/s +step 4398/19560 | loss 3.618396 (+0.41z)| norm 0.2414 (-1.41z)| lr 5.45e-04 | 8448.32 ms | -100.0% bf16 MFU | 62067 tok/s +step 4399/19560 | loss 3.540412 (-1.37z)| norm 0.2564 (-0.93z)| lr 5.45e-04 | 8442.70 ms | -100.0% bf16 MFU | 62068 tok/s +step 4400/19560 | loss 3.564326 (-0.82z)| norm 0.2564 (-0.93z)| lr 5.45e-04 | 8444.24 ms | -100.0% bf16 MFU | 62069 tok/s +step 4401/19560 | loss 3.548925 (-1.16z)| norm 0.2546 (-0.98z)| lr 5.45e-04 | 8448.27 ms | -100.0% bf16 MFU | 62069 tok/s +step 4402/19560 | loss 3.579227 (-0.46z)| norm 0.2572 (-0.89z)| lr 5.45e-04 | 8442.18 ms | -100.0% bf16 MFU | 62070 tok/s +step 4403/19560 | loss 3.586964 (-0.28z)| norm 0.2562 (-0.92z)| lr 5.45e-04 | 8442.47 ms | -100.0% bf16 MFU | 62072 tok/s +step 4404/19560 | loss 3.574070 (-0.57z)| norm 0.2823 (-0.12z)| lr 5.45e-04 | 8447.54 ms | -100.0% bf16 MFU | 62072 tok/s +step 4405/19560 | loss 3.570076 (-0.66z)| norm 0.2721 (-0.42z)| lr 5.45e-04 | 8444.70 ms | -100.0% bf16 MFU | 62072 tok/s +step 4406/19560 | loss 3.570280 (-0.65z)| norm 0.2497 (-1.12z)| lr 5.45e-04 | 8442.53 ms | -100.0% bf16 MFU | 62074 tok/s +step 4407/19560 | loss 3.578541 (-0.45z)| norm 0.2539 (-0.98z)| lr 5.45e-04 | 8443.95 ms | -100.0% bf16 MFU | 62075 tok/s +step 4408/19560 | loss 3.624020 (+0.59z)| norm 0.2527 (-1.00z)| lr 5.45e-04 | 8439.60 ms | -100.0% bf16 MFU | 62077 tok/s +step 4409/19560 | loss 3.534580 (-1.43z)| norm 0.2930 (+0.27z)| lr 5.45e-04 | 8442.26 ms | -100.0% bf16 MFU | 62078 tok/s +step 4410/19560 | loss 3.627645 (+0.67z)| norm 0.2788 (-0.17z)| lr 5.45e-04 | 8442.71 ms | -100.0% bf16 MFU | 62079 tok/s +step 4411/19560 | loss 3.563089 (-0.79z)| norm 0.2542 (-0.94z)| lr 5.45e-04 | 8444.09 ms | -100.0% bf16 MFU | 62080 tok/s +step 4412/19560 | loss 3.594139 (-0.08z)| norm 0.2692 (-0.46z)| lr 5.44e-04 | 8446.04 ms | -100.0% bf16 MFU | 62080 tok/s +step 4413/19560 | loss 3.678498 (+1.81z)| norm 0.2700 (-0.43z)| lr 5.44e-04 | 8449.08 ms | -100.0% bf16 MFU | 62078 tok/s +step 4414/19560 | loss 3.602126 (+0.08z)| norm 0.3112 (+0.87z)| lr 5.44e-04 | 8441.90 ms | -100.0% bf16 MFU | 62080 tok/s +step 4415/19560 | loss 3.581821 (-0.41z)| norm 0.3088 (+0.79z)| lr 5.44e-04 | 8446.99 ms | -100.0% bf16 MFU | 62079 tok/s +step 4416/19560 | loss 3.543226 (-1.30z)| norm 0.2782 (-0.19z)| lr 5.44e-04 | 8442.58 ms | -100.0% bf16 MFU | 62080 tok/s +step 4417/19560 | loss 3.600682 (+0.04z)| norm 0.2859 (+0.06z)| lr 5.44e-04 | 8442.63 ms | -100.0% bf16 MFU | 62081 tok/s +step 4418/19560 | loss 3.574893 (-0.55z)| norm 0.2992 (+0.48z)| lr 5.44e-04 | 8436.76 ms | -100.0% bf16 MFU | 62084 tok/s +step 4419/19560 | loss 3.584927 (-0.32z)| norm 0.2934 (+0.28z)| lr 5.44e-04 | 8434.78 ms | -100.0% bf16 MFU | 62088 tok/s +step 4420/19560 | loss 3.592005 (-0.15z)| norm 0.2599 (-0.77z)| lr 5.44e-04 | 8432.00 ms | -100.0% bf16 MFU | 62092 tok/s +step 4421/19560 | loss 3.574346 (-0.55z)| norm 0.2464 (-1.19z)| lr 5.44e-04 | 8436.37 ms | -100.0% bf16 MFU | 62095 tok/s +step 4422/19560 | loss 3.592392 (-0.14z)| norm 0.2705 (-0.43z)| lr 5.44e-04 | 8436.85 ms | -100.0% bf16 MFU | 62097 tok/s +step 4423/19560 | loss 3.615594 (+0.40z)| norm 0.2438 (-1.27z)| lr 5.44e-04 | 8434.91 ms | -100.0% bf16 MFU | 62100 tok/s +step 4424/19560 | loss 3.607409 (+0.20z)| norm 0.3154 (+0.98z)| lr 5.44e-04 | 8437.21 ms | -100.0% bf16 MFU | 62102 tok/s +step 4425/19560 | loss 3.612291 (+0.31z)| norm 0.3181 (+1.06z)| lr 5.44e-04 | 8436.44 ms | -100.0% bf16 MFU | 62105 tok/s +step 4426/19560 | loss 3.590364 (-0.20z)| norm 0.2747 (-0.30z)| lr 5.44e-04 | 8432.99 ms | -100.0% bf16 MFU | 62108 tok/s +step 4427/19560 | loss 3.589215 (-0.22z)| norm 0.2748 (-0.28z)| lr 5.44e-04 | 8431.56 ms | -100.0% bf16 MFU | 62112 tok/s +step 4428/19560 | loss 3.523144 (-1.75z)| norm 0.2600 (-0.74z)| lr 5.44e-04 | 8438.71 ms | -100.0% bf16 MFU | 62112 tok/s +step 4429/19560 | loss 3.578754 (-0.45z)| norm 0.2516 (-0.99z)| lr 5.44e-04 | 8437.22 ms | -100.0% bf16 MFU | 62114 tok/s +step 4430/19560 | loss 3.633640 (+0.83z)| norm 0.2493 (-1.05z)| lr 5.44e-04 | 8433.89 ms | -100.0% bf16 MFU | 62116 tok/s +step 4431/19560 | loss 3.552180 (-1.05z)| norm 0.2586 (-0.75z)| lr 5.44e-04 | 8437.22 ms | -100.0% bf16 MFU | 62118 tok/s +step 4432/19560 | loss 3.644758 (+1.09z)| norm 0.2661 (-0.50z)| lr 5.44e-04 | 8437.31 ms | -100.0% bf16 MFU | 62119 tok/s +step 4433/19560 | loss 3.660102 (+1.44z)| norm 0.2761 (-0.19z)| lr 5.44e-04 | 8437.87 ms | -100.0% bf16 MFU | 62119 tok/s +step 4434/19560 | loss 3.586080 (-0.28z)| norm 0.2977 (+0.49z)| lr 5.44e-04 | 8434.45 ms | -100.0% bf16 MFU | 62121 tok/s +step 4435/19560 | loss 3.600348 (+0.05z)| norm 0.3173 (+1.09z)| lr 5.44e-04 | 8431.12 ms | -100.0% bf16 MFU | 62125 tok/s +step 4436/19560 | loss 3.547681 (-1.19z)| norm 0.2670 (-0.49z)| lr 5.44e-04 | 8436.19 ms | -100.0% bf16 MFU | 62126 tok/s +step 4437/19560 | loss 3.545424 (-1.22z)| norm 0.2811 (-0.05z)| lr 5.44e-04 | 8434.31 ms | -100.0% bf16 MFU | 62128 tok/s +step 4438/19560 | loss 3.564828 (-0.77z)| norm 0.3133 (+0.95z)| lr 5.44e-04 | 8438.35 ms | -100.0% bf16 MFU | 62128 tok/s +step 4439/19560 | loss 3.617093 (+0.44z)| norm 0.2779 (-0.16z)| lr 5.44e-04 | 8439.41 ms | -100.0% bf16 MFU | 62128 tok/s +step 4440/19560 | loss 3.655763 (+1.32z)| norm 0.2810 (-0.08z)| lr 5.44e-04 | 8440.81 ms | -100.0% bf16 MFU | 62127 tok/s +step 4441/19560 | loss 3.585261 (-0.33z)| norm 0.2898 (+0.19z)| lr 5.44e-04 | 8440.52 ms | -100.0% bf16 MFU | 62126 tok/s +step 4442/19560 | loss 3.642514 (+1.00z)| norm 0.3054 (+0.68z)| lr 5.44e-04 | 8438.71 ms | -100.0% bf16 MFU | 62126 tok/s +step 4443/19560 | loss 3.622212 (+0.52z)| norm 0.2750 (-0.29z)| lr 5.44e-04 | 8442.57 ms | -100.0% bf16 MFU | 62125 tok/s +step 4444/19560 | loss 3.564555 (-0.83z)| norm 0.2829 (-0.04z)| lr 5.44e-04 | 8443.75 ms | -100.0% bf16 MFU | 62123 tok/s +step 4445/19560 | loss 3.679763 (+1.83z)| norm 0.2765 (-0.26z)| lr 5.44e-04 | 8441.39 ms | -100.0% bf16 MFU | 62123 tok/s +step 4446/19560 | loss 3.571953 (-0.65z)| norm 0.2638 (-0.65z)| lr 5.43e-04 | 8439.64 ms | -100.0% bf16 MFU | 62123 tok/s +step 4447/19560 | loss 3.561807 (-0.87z)| norm 0.2411 (-1.36z)| lr 5.43e-04 | 8439.21 ms | -100.0% bf16 MFU | 62123 tok/s +step 4448/19560 | loss 3.589596 (-0.22z)| norm 0.2803 (-0.11z)| lr 5.43e-04 | 8441.46 ms | -100.0% bf16 MFU | 62122 tok/s +step 4449/19560 | loss 3.577276 (-0.51z)| norm 0.2776 (-0.20z)| lr 5.43e-04 | 8437.42 ms | -100.0% bf16 MFU | 62123 tok/s +step 4450/19560 | loss 3.532869 (-1.51z)| norm 0.2658 (-0.58z)| lr 5.43e-04 | 8442.20 ms | -100.0% bf16 MFU | 62122 tok/s +step 4451/19560 | loss 3.628053 (+0.67z)| norm 0.3081 (+0.76z)| lr 5.43e-04 | 8440.88 ms | -100.0% bf16 MFU | 62122 tok/s +step 4452/19560 | loss 3.597109 (-0.04z)| norm 0.2781 (-0.19z)| lr 5.43e-04 | 8439.49 ms | -100.0% bf16 MFU | 62122 tok/s +step 4453/19560 | loss 3.545360 (-1.21z)| norm 0.2906 (+0.21z)| lr 5.43e-04 | 8439.90 ms | -100.0% bf16 MFU | 62122 tok/s +step 4454/19560 | loss 3.545387 (-1.20z)| norm 0.3039 (+0.62z)| lr 5.43e-04 | 8442.34 ms | -100.0% bf16 MFU | 62121 tok/s +step 4455/19560 | loss 3.625659 (+0.62z)| norm 0.3122 (+0.87z)| lr 5.43e-04 | 8440.58 ms | -100.0% bf16 MFU | 62120 tok/s +step 4456/19560 | loss 3.595230 (-0.06z)| norm 0.2957 (+0.34z)| lr 5.43e-04 | 8440.70 ms | -100.0% bf16 MFU | 62120 tok/s +step 4457/19560 | loss 3.635770 (+0.85z)| norm 0.2638 (-0.67z)| lr 5.43e-04 | 8443.25 ms | -100.0% bf16 MFU | 62119 tok/s +step 4458/19560 | loss 3.604023 (+0.13z)| norm 0.2848 (+0.00z)| lr 5.43e-04 | 8442.75 ms | -100.0% bf16 MFU | 62118 tok/s +step 4459/19560 | loss 3.609212 (+0.24z)| norm 0.2814 (-0.11z)| lr 5.43e-04 | 8441.29 ms | -100.0% bf16 MFU | 62117 tok/s +step 4460/19560 | loss 3.604956 (+0.13z)| norm 0.3888 (+3.16z)| lr 5.43e-04 | 8442.89 ms | -100.0% bf16 MFU | 62116 tok/s +step 4461/19560 | loss 3.609907 (+0.26z)| norm 0.2753 (-0.32z)| lr 5.43e-04 | 8440.12 ms | -100.0% bf16 MFU | 62117 tok/s +step 4462/19560 | loss 3.604239 (+0.14z)| norm 0.3016 (+0.48z)| lr 5.43e-04 | 8438.92 ms | -100.0% bf16 MFU | 62117 tok/s +step 4463/19560 | loss 3.585775 (-0.28z)| norm 0.2757 (-0.31z)| lr 5.43e-04 | 8439.52 ms | -100.0% bf16 MFU | 62117 tok/s +step 4464/19560 | loss 3.602750 (+0.11z)| norm 0.2627 (-0.70z)| lr 5.43e-04 | 8440.66 ms | -100.0% bf16 MFU | 62117 tok/s +step 4465/19560 | loss 3.636703 (+0.88z)| norm 0.2482 (-1.12z)| lr 5.43e-04 | 8443.40 ms | -100.0% bf16 MFU | 62116 tok/s +step 4466/19560 | loss 3.558840 (-0.92z)| norm 0.2898 (+0.16z)| lr 5.43e-04 | 8443.59 ms | -100.0% bf16 MFU | 62115 tok/s +step 4467/19560 | loss 3.633198 (+0.79z)| norm 0.2996 (+0.45z)| lr 5.43e-04 | 8443.06 ms | -100.0% bf16 MFU | 62114 tok/s +step 4468/19560 | loss 3.612138 (+0.30z)| norm 0.2631 (-0.67z)| lr 5.43e-04 | 8440.34 ms | -100.0% bf16 MFU | 62114 tok/s +step 4469/19560 | loss 3.638366 (+0.93z)| norm 0.3546 (+2.09z)| lr 5.43e-04 | 8440.81 ms | -100.0% bf16 MFU | 62114 tok/s +step 4470/19560 | loss 3.572494 (-0.61z)| norm 0.3508 (+1.93z)| lr 5.43e-04 | 8441.06 ms | -100.0% bf16 MFU | 62114 tok/s +step 4471/19560 | loss 3.608525 (+0.24z)| norm 0.2999 (+0.42z)| lr 5.43e-04 | 8438.85 ms | -100.0% bf16 MFU | 62115 tok/s +step 4472/19560 | loss 3.607899 (+0.23z)| norm 0.2914 (+0.17z)| lr 5.43e-04 | 8439.79 ms | -100.0% bf16 MFU | 62115 tok/s +step 4473/19560 | loss 3.645001 (+1.09z)| norm 0.2913 (+0.17z)| lr 5.43e-04 | 8440.96 ms | -100.0% bf16 MFU | 62115 tok/s +step 4474/19560 | loss 3.650266 (+1.19z)| norm 0.3543 (+2.01z)| lr 5.43e-04 | 8436.95 ms | -100.0% bf16 MFU | 62116 tok/s +step 4475/19560 | loss 3.624395 (+0.58z)| norm 0.2962 (+0.29z)| lr 5.43e-04 | 8438.25 ms | -100.0% bf16 MFU | 62117 tok/s +step 4476/19560 | loss 3.503035 (-2.20z)| norm 0.2750 (-0.33z)| lr 5.43e-04 | 8437.60 ms | -100.0% bf16 MFU | 62118 tok/s +step 4477/19560 | loss 3.543060 (-1.27z)| norm 0.3056 (+0.56z)| lr 5.43e-04 | 8438.54 ms | -100.0% bf16 MFU | 62119 tok/s +step 4478/19560 | loss 3.624274 (+0.60z)| norm 0.3035 (+0.52z)| lr 5.43e-04 | 8441.85 ms | -100.0% bf16 MFU | 62118 tok/s +step 4479/19560 | loss 3.587165 (-0.24z)| norm 0.2767 (-0.27z)| lr 5.43e-04 | 8440.68 ms | -100.0% bf16 MFU | 62118 tok/s +step 4480/19560 | loss 3.595526 (-0.05z)| norm 0.2745 (-0.33z)| lr 5.42e-04 | 8442.29 ms | -100.0% bf16 MFU | 62117 tok/s +step 4481/19560 | loss 3.527690 (-1.69z)| norm 0.2864 (+0.03z)| lr 5.42e-04 | 8439.42 ms | -100.0% bf16 MFU | 62117 tok/s +step 4482/19560 | loss 3.573783 (-0.57z)| norm 0.2690 (-0.49z)| lr 5.42e-04 | 8439.62 ms | -100.0% bf16 MFU | 62118 tok/s +step 4483/19560 | loss 3.512339 (-2.02z)| norm 0.2548 (-0.91z)| lr 5.42e-04 | 8438.04 ms | -100.0% bf16 MFU | 62118 tok/s +step 4484/19560 | loss 3.567166 (-0.71z)| norm 0.2550 (-0.89z)| lr 5.42e-04 | 8437.23 ms | -100.0% bf16 MFU | 62120 tok/s +step 4485/19560 | loss 3.572752 (-0.56z)| norm 0.2441 (-1.21z)| lr 5.42e-04 | 8440.06 ms | -100.0% bf16 MFU | 62119 tok/s +step 4486/19560 | loss 3.649749 (+1.33z)| norm 0.2446 (-1.18z)| lr 5.42e-04 | 8440.72 ms | -100.0% bf16 MFU | 62119 tok/s +step 4487/19560 | loss 3.559889 (-0.88z)| norm 0.2341 (-1.81z)| lr 5.42e-04 | 8439.27 ms | -100.0% bf16 MFU | 62120 tok/s +step 4488/19560 | loss 3.640643 (+1.12z)| norm 0.2695 (-0.43z)| lr 5.42e-04 | 8439.24 ms | -100.0% bf16 MFU | 62120 tok/s +step 4489/19560 | loss 3.573223 (-0.54z)| norm 0.2795 (-0.03z)| lr 5.42e-04 | 8438.69 ms | -100.0% bf16 MFU | 62120 tok/s +step 4490/19560 | loss 3.628343 (+0.83z)| norm 0.2878 (+0.31z)| lr 5.42e-04 | 8442.18 ms | -100.0% bf16 MFU | 62119 tok/s +step 4491/19560 | loss 3.532254 (-1.54z)| norm 0.2631 (-0.66z)| lr 5.42e-04 | 8440.84 ms | -100.0% bf16 MFU | 62119 tok/s +step 4492/19560 | loss 3.569151 (-0.62z)| norm 0.2582 (-0.84z)| lr 5.42e-04 | 8455.63 ms | -100.0% bf16 MFU | 62113 tok/s +step 4493/19560 | loss 3.621770 (+0.66z)| norm 0.2412 (-1.49z)| lr 5.42e-04 | 8468.39 ms | -100.0% bf16 MFU | 62103 tok/s +step 4494/19560 | loss 3.549994 (-1.09z)| norm 0.2403 (-1.50z)| lr 5.42e-04 | 8466.59 ms | -100.0% bf16 MFU | 62094 tok/s +step 4495/19560 | loss 3.592528 (-0.04z)| norm 0.2605 (-0.69z)| lr 5.42e-04 | 8462.54 ms | -100.0% bf16 MFU | 62087 tok/s +step 4496/19560 | loss 3.562795 (-0.77z)| norm 0.2661 (-0.47z)| lr 5.42e-04 | 8460.98 ms | -100.0% bf16 MFU | 62081 tok/s +step 4497/19560 | loss 3.547265 (-1.14z)| norm 0.2576 (-0.80z)| lr 5.42e-04 | 8467.77 ms | -100.0% bf16 MFU | 62073 tok/s +step 4498/19560 | loss 3.619891 (+0.63z)| norm 0.2966 (+0.74z)| lr 5.42e-04 | 8462.88 ms | -100.0% bf16 MFU | 62067 tok/s +step 4499/19560 | loss 3.608210 (+0.34z)| norm 0.3045 (+1.04z)| lr 5.42e-04 | 8460.38 ms | -100.0% bf16 MFU | 62062 tok/s +step 4500/19560 | loss 3.632563 (+0.93z)| norm 0.2775 (-0.03z)| lr 5.42e-04 | 8462.54 ms | -100.0% bf16 MFU | 62057 tok/s +val loss 3.578791 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2684/10042 = 0.267277 +step 4501/19560 | loss 3.588658 (-0.13z)| norm 0.2952 (+0.67z)| lr 5.42e-04 | 8463.76 ms | -100.0% bf16 MFU | 62051 tok/s +step 4502/19560 | loss 3.667950 (+1.80z)| norm 0.3001 (+0.86z)| lr 5.42e-04 | 8459.96 ms | -100.0% bf16 MFU | 62047 tok/s +step 4503/19560 | loss 3.576212 (-0.45z)| norm 0.2530 (-0.98z)| lr 5.42e-04 | 8462.91 ms | -100.0% bf16 MFU | 62042 tok/s +step 4504/19560 | loss 3.517125 (-1.86z)| norm 0.2676 (-0.41z)| lr 5.42e-04 | 8458.72 ms | -100.0% bf16 MFU | 62039 tok/s +step 4505/19560 | loss 3.553699 (-1.00z)| norm 0.2652 (-0.50z)| lr 5.42e-04 | 8460.46 ms | -100.0% bf16 MFU | 62036 tok/s +step 4506/19560 | loss 3.723767 (+3.07z)| norm 0.2842 (+0.25z)| lr 5.42e-04 | 8462.86 ms | -100.0% bf16 MFU | 62032 tok/s +step 4507/19560 | loss 3.617429 (+0.53z)| norm 0.2920 (+0.56z)| lr 5.42e-04 | 8457.68 ms | -100.0% bf16 MFU | 62029 tok/s +step 4508/19560 | loss 3.584112 (-0.26z)| norm 0.2644 (-0.54z)| lr 5.42e-04 | 8455.37 ms | -100.0% bf16 MFU | 62028 tok/s +step 4509/19560 | loss 3.608966 (+0.35z)| norm 0.2685 (-0.37z)| lr 5.42e-04 | 8454.01 ms | -100.0% bf16 MFU | 62028 tok/s +step 4510/19560 | loss 3.555593 (-0.95z)| norm 0.2940 (+0.63z)| lr 5.42e-04 | 8467.47 ms | -100.0% bf16 MFU | 62022 tok/s +step 4511/19560 | loss 3.536478 (-1.39z)| norm 0.2661 (-0.48z)| lr 5.42e-04 | 8458.34 ms | -100.0% bf16 MFU | 62020 tok/s +step 4512/19560 | loss 3.580235 (-0.34z)| norm 0.2620 (-0.63z)| lr 5.42e-04 | 8453.20 ms | -100.0% bf16 MFU | 62020 tok/s +step 4513/19560 | loss 3.561830 (-0.78z)| norm 0.2986 (+0.80z)| lr 5.42e-04 | 8460.16 ms | -100.0% bf16 MFU | 62018 tok/s +step 4514/19560 | loss 3.574069 (-0.47z)| norm 0.2898 (+0.44z)| lr 5.41e-04 | 8455.68 ms | -100.0% bf16 MFU | 62017 tok/s +step 4515/19560 | loss 3.565279 (-0.68z)| norm 0.2829 (+0.17z)| lr 5.41e-04 | 8453.04 ms | -100.0% bf16 MFU | 62018 tok/s +step 4516/19560 | loss 3.626888 (+0.81z)| norm 0.3121 (+1.31z)| lr 5.41e-04 | 8458.38 ms | -100.0% bf16 MFU | 62016 tok/s +step 4517/19560 | loss 3.583960 (-0.23z)| norm 0.3025 (+0.94z)| lr 5.41e-04 | 8455.89 ms | -100.0% bf16 MFU | 62015 tok/s +step 4518/19560 | loss 3.581903 (-0.26z)| norm 0.2911 (+0.50z)| lr 5.41e-04 | 8460.61 ms | -100.0% bf16 MFU | 62013 tok/s +step 4519/19560 | loss 3.569021 (-0.57z)| norm 0.3020 (+0.92z)| lr 5.41e-04 | 8454.22 ms | -100.0% bf16 MFU | 62013 tok/s +step 4520/19560 | loss 3.542607 (-1.22z)| norm 0.2870 (+0.32z)| lr 5.41e-04 | 8456.06 ms | -100.0% bf16 MFU | 62012 tok/s +step 4521/19560 | loss 3.526254 (-1.64z)| norm 0.2763 (-0.11z)| lr 5.41e-04 | 8458.26 ms | -100.0% bf16 MFU | 62011 tok/s +step 4522/19560 | loss 3.539648 (-1.31z)| norm 0.2751 (-0.15z)| lr 5.41e-04 | 8450.70 ms | -100.0% bf16 MFU | 62013 tok/s +step 4523/19560 | loss 3.559386 (-0.79z)| norm 0.2644 (-0.58z)| lr 5.41e-04 | 8453.48 ms | -100.0% bf16 MFU | 62013 tok/s +step 4524/19560 | loss 3.586284 (-0.08z)| norm 0.2455 (-1.33z)| lr 5.41e-04 | 8449.13 ms | -100.0% bf16 MFU | 62015 tok/s +step 4525/19560 | loss 3.571431 (-0.47z)| norm 0.2506 (-1.14z)| lr 5.41e-04 | 8449.91 ms | -100.0% bf16 MFU | 62017 tok/s +step 4526/19560 | loss 3.632825 (+1.13z)| norm 0.2419 (-1.48z)| lr 5.41e-04 | 8461.64 ms | -100.0% bf16 MFU | 62014 tok/s +step 4527/19560 | loss 3.516808 (-1.87z)| norm 0.2499 (-1.16z)| lr 5.41e-04 | 8448.71 ms | -100.0% bf16 MFU | 62016 tok/s +step 4528/19560 | loss 3.556382 (-0.85z)| norm 0.2388 (-1.59z)| lr 5.41e-04 | 8449.87 ms | -100.0% bf16 MFU | 62017 tok/s +step 4529/19560 | loss 3.567051 (-0.58z)| norm 0.2402 (-1.52z)| lr 5.41e-04 | 8453.05 ms | -100.0% bf16 MFU | 62018 tok/s +step 4530/19560 | loss 3.703643 (+2.84z)| norm 0.2546 (-0.95z)| lr 5.41e-04 | 8451.35 ms | -100.0% bf16 MFU | 62019 tok/s +step 4531/19560 | loss 3.528367 (-1.53z)| norm 0.2764 (-0.10z)| lr 5.41e-04 | 8449.40 ms | -100.0% bf16 MFU | 62020 tok/s +step 4532/19560 | loss 3.548487 (-1.02z)| norm 0.2628 (-0.63z)| lr 5.41e-04 | 8454.34 ms | -100.0% bf16 MFU | 62020 tok/s +step 4533/19560 | loss 3.586432 (-0.08z)| norm 0.2586 (-0.79z)| lr 5.41e-04 | 8452.30 ms | -100.0% bf16 MFU | 62020 tok/s +step 4534/19560 | loss 3.506872 (-2.01z)| norm 0.2759 (-0.11z)| lr 5.41e-04 | 8453.85 ms | -100.0% bf16 MFU | 62020 tok/s +step 4535/19560 | loss 3.758180 (+3.84z)| norm 0.2799 (+0.04z)| lr 5.41e-04 | 8450.88 ms | -100.0% bf16 MFU | 62021 tok/s +step 4536/19560 | loss 3.622804 (+0.74z)| norm 0.3606 (+3.10z)| lr 5.41e-04 | 8451.95 ms | -100.0% bf16 MFU | 62022 tok/s +step 4537/19560 | loss 3.567700 (-0.53z)| norm 0.3177 (+1.44z)| lr 5.41e-04 | 8449.29 ms | -100.0% bf16 MFU | 62023 tok/s +step 4538/19560 | loss 3.600600 (+0.23z)| norm 0.2808 (+0.03z)| lr 5.41e-04 | 8455.20 ms | -100.0% bf16 MFU | 62022 tok/s +step 4539/19560 | loss 3.553761 (-0.85z)| norm 0.2821 (+0.07z)| lr 5.41e-04 | 8446.89 ms | -100.0% bf16 MFU | 62025 tok/s +step 4540/19560 | loss 3.584645 (-0.14z)| norm 0.2894 (+0.34z)| lr 5.41e-04 | 8446.72 ms | -100.0% bf16 MFU | 62027 tok/s +step 4541/19560 | loss 3.563210 (-0.62z)| norm 0.2624 (-0.69z)| lr 5.41e-04 | 8453.16 ms | -100.0% bf16 MFU | 62027 tok/s +step 4542/19560 | loss 3.641455 (+1.20z)| norm 0.2957 (+0.59z)| lr 5.41e-04 | 8446.13 ms | -100.0% bf16 MFU | 62029 tok/s +step 4543/19560 | loss 3.520506 (-1.59z)| norm 0.2856 (+0.21z)| lr 5.41e-04 | 8451.35 ms | -100.0% bf16 MFU | 62029 tok/s +step 4544/19560 | loss 3.593647 (+0.09z)| norm 0.2575 (-0.86z)| lr 5.41e-04 | 8455.28 ms | -100.0% bf16 MFU | 62028 tok/s +step 4545/19560 | loss 3.558650 (-0.71z)| norm 0.2497 (-1.14z)| lr 5.41e-04 | 8454.83 ms | -100.0% bf16 MFU | 62027 tok/s +step 4546/19560 | loss 3.566125 (-0.54z)| norm 0.2818 (+0.09z)| lr 5.41e-04 | 8446.06 ms | -100.0% bf16 MFU | 62030 tok/s +step 4547/19560 | loss 3.567396 (-0.51z)| norm 0.2901 (+0.41z)| lr 5.41e-04 | 8455.63 ms | -100.0% bf16 MFU | 62029 tok/s +step 4548/19560 | loss 3.633276 (+1.00z)| norm 0.2917 (+0.46z)| lr 5.40e-04 | 8448.30 ms | -100.0% bf16 MFU | 62030 tok/s +step 4549/19560 | loss 3.568596 (-0.48z)| norm 0.2669 (-0.50z)| lr 5.40e-04 | 8448.80 ms | -100.0% bf16 MFU | 62031 tok/s +step 4550/19560 | loss 3.592155 (+0.06z)| norm 0.2662 (-0.53z)| lr 5.40e-04 | 8450.21 ms | -100.0% bf16 MFU | 62032 tok/s +step 4551/19560 | loss 3.589310 (-0.00z)| norm 0.2674 (-0.49z)| lr 5.40e-04 | 8444.84 ms | -100.0% bf16 MFU | 62035 tok/s +step 4552/19560 | loss 3.532869 (-1.28z)| norm 0.2849 (+0.20z)| lr 5.40e-04 | 8449.15 ms | -100.0% bf16 MFU | 62035 tok/s +step 4553/19560 | loss 3.590720 (+0.05z)| norm 0.2849 (+0.21z)| lr 5.40e-04 | 8447.93 ms | -100.0% bf16 MFU | 62037 tok/s +step 4554/19560 | loss 3.602157 (+0.31z)| norm 0.2735 (-0.24z)| lr 5.40e-04 | 8449.83 ms | -100.0% bf16 MFU | 62037 tok/s +step 4555/19560 | loss 3.575682 (-0.30z)| norm 0.2546 (-0.97z)| lr 5.40e-04 | 8445.07 ms | -100.0% bf16 MFU | 62039 tok/s +step 4556/19560 | loss 3.575542 (-0.31z)| norm 0.2782 (-0.05z)| lr 5.40e-04 | 8447.12 ms | -100.0% bf16 MFU | 62041 tok/s +step 4557/19560 | loss 3.534862 (-1.24z)| norm 0.2526 (-1.06z)| lr 5.40e-04 | 8445.59 ms | -100.0% bf16 MFU | 62043 tok/s +step 4558/19560 | loss 3.583562 (-0.11z)| norm 0.2581 (-0.85z)| lr 5.40e-04 | 8449.52 ms | -100.0% bf16 MFU | 62043 tok/s +step 4559/19560 | loss 3.590155 (+0.04z)| norm 0.2772 (-0.10z)| lr 5.40e-04 | 8445.43 ms | -100.0% bf16 MFU | 62045 tok/s +step 4560/19560 | loss 3.525664 (-1.43z)| norm 0.2886 (+0.35z)| lr 5.40e-04 | 8446.65 ms | -100.0% bf16 MFU | 62046 tok/s +step 4561/19560 | loss 3.566752 (-0.47z)| norm 0.2674 (-0.49z)| lr 5.40e-04 | 8448.12 ms | -100.0% bf16 MFU | 62047 tok/s +step 4562/19560 | loss 3.527621 (-1.36z)| norm 0.2718 (-0.31z)| lr 5.40e-04 | 8444.31 ms | -100.0% bf16 MFU | 62049 tok/s +step 4563/19560 | loss 3.557050 (-0.67z)| norm 0.2692 (-0.40z)| lr 5.40e-04 | 8449.95 ms | -100.0% bf16 MFU | 62049 tok/s +step 4564/19560 | loss 3.687724 (+2.28z)| norm 0.2575 (-0.86z)| lr 5.40e-04 | 8447.68 ms | -100.0% bf16 MFU | 62049 tok/s +step 4565/19560 | loss 3.562634 (-0.56z)| norm 0.2512 (-1.10z)| lr 5.40e-04 | 8448.91 ms | -100.0% bf16 MFU | 62050 tok/s +step 4566/19560 | loss 3.583657 (-0.09z)| norm 0.2455 (-1.31z)| lr 5.40e-04 | 8441.46 ms | -100.0% bf16 MFU | 62053 tok/s +step 4567/19560 | loss 3.581524 (-0.13z)| norm 0.2714 (-0.28z)| lr 5.40e-04 | 8450.01 ms | -100.0% bf16 MFU | 62052 tok/s +step 4568/19560 | loss 3.681124 (+2.12z)| norm 0.2609 (-0.69z)| lr 5.40e-04 | 8441.96 ms | -100.0% bf16 MFU | 62055 tok/s +step 4569/19560 | loss 3.545806 (-0.93z)| norm 0.2743 (-0.15z)| lr 5.40e-04 | 8443.71 ms | -100.0% bf16 MFU | 62057 tok/s +step 4570/19560 | loss 3.604468 (+0.40z)| norm 0.2828 (+0.20z)| lr 5.40e-04 | 8448.74 ms | -100.0% bf16 MFU | 62057 tok/s +step 4571/19560 | loss 3.572097 (-0.33z)| norm 0.3083 (+1.20z)| lr 5.40e-04 | 8443.95 ms | -100.0% bf16 MFU | 62058 tok/s +step 4572/19560 | loss 3.608753 (+0.50z)| norm 0.3048 (+1.04z)| lr 5.40e-04 | 8445.77 ms | -100.0% bf16 MFU | 62059 tok/s +step 4573/19560 | loss 3.564073 (-0.50z)| norm 0.3096 (+1.22z)| lr 5.40e-04 | 8447.02 ms | -100.0% bf16 MFU | 62060 tok/s +step 4574/19560 | loss 3.614250 (+0.65z)| norm 0.3036 (+0.97z)| lr 5.40e-04 | 8452.20 ms | -100.0% bf16 MFU | 62058 tok/s +step 4575/19560 | loss 3.561116 (-0.58z)| norm 0.3148 (+1.39z)| lr 5.40e-04 | 8444.74 ms | -100.0% bf16 MFU | 62060 tok/s +step 4576/19560 | loss 3.603512 (+0.40z)| norm 0.3039 (+0.95z)| lr 5.40e-04 | 8447.03 ms | -100.0% bf16 MFU | 62060 tok/s +step 4577/19560 | loss 3.636467 (+1.14z)| norm 0.3110 (+1.21z)| lr 5.40e-04 | 8447.38 ms | -100.0% bf16 MFU | 62060 tok/s +step 4578/19560 | loss 3.536158 (-1.16z)| norm 0.2748 (-0.20z)| lr 5.40e-04 | 8446.23 ms | -100.0% bf16 MFU | 62061 tok/s +step 4579/19560 | loss 3.561343 (-0.57z)| norm 0.2891 (+0.36z)| lr 5.40e-04 | 8441.43 ms | -100.0% bf16 MFU | 62063 tok/s +step 4580/19560 | loss 3.690008 (+2.32z)| norm 0.2765 (-0.13z)| lr 5.40e-04 | 8449.28 ms | -100.0% bf16 MFU | 62063 tok/s +step 4581/19560 | loss 3.555463 (-0.71z)| norm 0.3138 (+1.31z)| lr 5.39e-04 | 8447.22 ms | -100.0% bf16 MFU | 62063 tok/s +step 4582/19560 | loss 3.626380 (+0.87z)| norm 0.2837 (+0.15z)| lr 5.39e-04 | 8444.83 ms | -100.0% bf16 MFU | 62064 tok/s +step 4583/19560 | loss 3.592340 (+0.11z)| norm 0.3010 (+0.83z)| lr 5.39e-04 | 8445.18 ms | -100.0% bf16 MFU | 62065 tok/s +step 4584/19560 | loss 3.540080 (-1.06z)| norm 0.2799 (+0.01z)| lr 5.39e-04 | 8442.19 ms | -100.0% bf16 MFU | 62067 tok/s +step 4585/19560 | loss 3.550025 (-0.82z)| norm 0.2585 (-0.82z)| lr 5.39e-04 | 8441.86 ms | -100.0% bf16 MFU | 62069 tok/s +step 4586/19560 | loss 3.533925 (-1.17z)| norm 0.2720 (-0.29z)| lr 5.39e-04 | 8443.37 ms | -100.0% bf16 MFU | 62070 tok/s +step 4587/19560 | loss 3.590816 (+0.12z)| norm 0.2773 (-0.09z)| lr 5.39e-04 | 8446.49 ms | -100.0% bf16 MFU | 62070 tok/s +step 4588/19560 | loss 3.592464 (+0.16z)| norm 0.2893 (+0.45z)| lr 5.39e-04 | 8443.16 ms | -100.0% bf16 MFU | 62071 tok/s +step 4589/19560 | loss 3.577158 (-0.18z)| norm 0.2831 (+0.19z)| lr 5.39e-04 | 8444.66 ms | -100.0% bf16 MFU | 62072 tok/s +step 4590/19560 | loss 3.529241 (-1.25z)| norm 0.2658 (-0.54z)| lr 5.39e-04 | 8444.02 ms | -100.0% bf16 MFU | 62073 tok/s +step 4591/19560 | loss 3.561677 (-0.51z)| norm 0.2429 (-1.48z)| lr 5.39e-04 | 8441.39 ms | -100.0% bf16 MFU | 62075 tok/s +step 4592/19560 | loss 3.603329 (+0.42z)| norm 0.2650 (-0.55z)| lr 5.39e-04 | 8444.81 ms | -100.0% bf16 MFU | 62075 tok/s +step 4593/19560 | loss 3.626130 (+0.94z)| norm 0.2758 (-0.11z)| lr 5.39e-04 | 8446.45 ms | -100.0% bf16 MFU | 62075 tok/s +step 4594/19560 | loss 3.618718 (+0.76z)| norm 0.2402 (-1.58z)| lr 5.39e-04 | 8443.05 ms | -100.0% bf16 MFU | 62076 tok/s +step 4595/19560 | loss 3.602041 (+0.39z)| norm 0.2525 (-1.05z)| lr 5.39e-04 | 8444.24 ms | -100.0% bf16 MFU | 62077 tok/s +step 4596/19560 | loss 3.529319 (-1.23z)| norm 0.2626 (-0.63z)| lr 5.39e-04 | 8442.87 ms | -100.0% bf16 MFU | 62078 tok/s +step 4597/19560 | loss 3.547933 (-0.80z)| norm 0.2505 (-1.15z)| lr 5.39e-04 | 8447.25 ms | -100.0% bf16 MFU | 62077 tok/s +step 4598/19560 | loss 3.561960 (-0.48z)| norm 0.2362 (-1.78z)| lr 5.39e-04 | 8444.77 ms | -100.0% bf16 MFU | 62078 tok/s +step 4599/19560 | loss 3.548630 (-0.77z)| norm 0.2540 (-0.96z)| lr 5.39e-04 | 8444.42 ms | -100.0% bf16 MFU | 62078 tok/s +step 4600/19560 | loss 3.540813 (-0.93z)| norm 0.2677 (-0.35z)| lr 5.39e-04 | 8440.51 ms | -100.0% bf16 MFU | 62080 tok/s +step 4601/19560 | loss 3.687065 (+2.31z)| norm 0.3099 (+1.53z)| lr 5.39e-04 | 8441.34 ms | -100.0% bf16 MFU | 62081 tok/s +step 4602/19560 | loss 3.563993 (-0.40z)| norm 0.2699 (-0.24z)| lr 5.39e-04 | 8445.85 ms | -100.0% bf16 MFU | 62081 tok/s +step 4603/19560 | loss 3.670916 (+1.95z)| norm 0.2888 (+0.65z)| lr 5.39e-04 | 8444.65 ms | -100.0% bf16 MFU | 62081 tok/s +step 4604/19560 | loss 3.600759 (+0.40z)| norm 0.2419 (-1.52z)| lr 5.39e-04 | 8442.50 ms | -100.0% bf16 MFU | 62082 tok/s +step 4605/19560 | loss 3.513399 (-1.54z)| norm 0.2652 (-0.42z)| lr 5.39e-04 | 8448.22 ms | -100.0% bf16 MFU | 62081 tok/s +step 4606/19560 | loss 3.570866 (-0.26z)| norm 0.2930 (+0.88z)| lr 5.39e-04 | 8445.95 ms | -100.0% bf16 MFU | 62081 tok/s +step 4607/19560 | loss 3.566116 (-0.36z)| norm 0.2695 (-0.22z)| lr 5.39e-04 | 8446.44 ms | -100.0% bf16 MFU | 62080 tok/s +step 4608/19560 | loss 3.521228 (-1.33z)| norm 0.2472 (-1.25z)| lr 5.39e-04 | 8441.65 ms | -100.0% bf16 MFU | 62082 tok/s +step 4609/19560 | loss 3.606506 (+0.54z)| norm 0.2735 (-0.02z)| lr 5.39e-04 | 8445.47 ms | -100.0% bf16 MFU | 62082 tok/s +step 4610/19560 | loss 3.566180 (-0.36z)| norm 0.2544 (-0.90z)| lr 5.39e-04 | 8438.21 ms | -100.0% bf16 MFU | 62084 tok/s +step 4611/19560 | loss 3.551802 (-0.69z)| norm 0.2594 (-0.67z)| lr 5.39e-04 | 8440.64 ms | -100.0% bf16 MFU | 62086 tok/s +step 4612/19560 | loss 3.534290 (-1.07z)| norm 0.2781 (+0.20z)| lr 5.39e-04 | 8447.30 ms | -100.0% bf16 MFU | 62085 tok/s +step 4613/19560 | loss 3.580791 (-0.03z)| norm 0.2902 (+0.75z)| lr 5.39e-04 | 8438.58 ms | -100.0% bf16 MFU | 62087 tok/s +step 4614/19560 | loss 3.619517 (+0.84z)| norm 0.2525 (-1.03z)| lr 5.38e-04 | 8445.07 ms | -100.0% bf16 MFU | 62087 tok/s +step 4615/19560 | loss 3.591339 (+0.20z)| norm 0.2591 (-0.74z)| lr 5.38e-04 | 8442.80 ms | -100.0% bf16 MFU | 62087 tok/s +step 4616/19560 | loss 3.566851 (-0.34z)| norm 0.2657 (-0.42z)| lr 5.38e-04 | 8442.20 ms | -100.0% bf16 MFU | 62088 tok/s +step 4617/19560 | loss 3.502342 (-1.76z)| norm 0.2920 (+0.83z)| lr 5.38e-04 | 8443.39 ms | -100.0% bf16 MFU | 62088 tok/s +step 4618/19560 | loss 3.618937 (+0.84z)| norm 0.2578 (-0.79z)| lr 5.38e-04 | 8441.39 ms | -100.0% bf16 MFU | 62090 tok/s +step 4619/19560 | loss 3.606328 (+0.55z)| norm 0.2850 (+0.50z)| lr 5.38e-04 | 8441.28 ms | -100.0% bf16 MFU | 62091 tok/s +step 4620/19560 | loss 3.570801 (-0.24z)| norm 0.2943 (+0.93z)| lr 5.38e-04 | 8440.63 ms | -100.0% bf16 MFU | 62092 tok/s +step 4621/19560 | loss 3.554758 (-0.59z)| norm 0.2923 (+0.82z)| lr 5.38e-04 | 8440.96 ms | -100.0% bf16 MFU | 62093 tok/s +step 4622/19560 | loss 3.565388 (-0.36z)| norm 0.2696 (-0.29z)| lr 5.38e-04 | 8441.19 ms | -100.0% bf16 MFU | 62094 tok/s +step 4623/19560 | loss 3.597744 (+0.37z)| norm 0.2609 (-0.71z)| lr 5.38e-04 | 8441.35 ms | -100.0% bf16 MFU | 62094 tok/s +step 4624/19560 | loss 3.536061 (-1.01z)| norm 0.2611 (-0.69z)| lr 5.38e-04 | 8443.49 ms | -100.0% bf16 MFU | 62094 tok/s +step 4625/19560 | loss 3.602544 (+0.47z)| norm 0.2992 (+1.13z)| lr 5.38e-04 | 8439.44 ms | -100.0% bf16 MFU | 62096 tok/s +step 4626/19560 | loss 3.502517 (-1.74z)| norm 0.2431 (-1.55z)| lr 5.38e-04 | 8442.25 ms | -100.0% bf16 MFU | 62096 tok/s +step 4627/19560 | loss 3.612404 (+0.70z)| norm 0.2801 (+0.24z)| lr 5.38e-04 | 8444.69 ms | -100.0% bf16 MFU | 62096 tok/s +step 4628/19560 | loss 3.597390 (+0.38z)| norm 0.2829 (+0.37z)| lr 5.38e-04 | 8441.15 ms | -100.0% bf16 MFU | 62096 tok/s +step 4629/19560 | loss 3.575363 (-0.11z)| norm 0.2856 (+0.51z)| lr 5.38e-04 | 8437.62 ms | -100.0% bf16 MFU | 62098 tok/s +step 4630/19560 | loss 3.529816 (-1.11z)| norm 0.2581 (-0.81z)| lr 5.38e-04 | 8439.30 ms | -100.0% bf16 MFU | 62100 tok/s +step 4631/19560 | loss 3.640404 (+1.36z)| norm 0.2749 (-0.00z)| lr 5.38e-04 | 8441.15 ms | -100.0% bf16 MFU | 62100 tok/s +step 4632/19560 | loss 3.538369 (-0.93z)| norm 0.3138 (+1.86z)| lr 5.38e-04 | 8439.27 ms | -100.0% bf16 MFU | 62102 tok/s +step 4633/19560 | loss 3.583524 (+0.08z)| norm 0.2899 (+0.70z)| lr 5.38e-04 | 8438.59 ms | -100.0% bf16 MFU | 62103 tok/s +step 4634/19560 | loss 3.552652 (-0.61z)| norm 0.2778 (+0.11z)| lr 5.38e-04 | 8441.32 ms | -100.0% bf16 MFU | 62103 tok/s +step 4635/19560 | loss 3.590501 (+0.28z)| norm 0.3198 (+2.10z)| lr 5.38e-04 | 8441.80 ms | -100.0% bf16 MFU | 62103 tok/s +step 4636/19560 | loss 3.607454 (+0.67z)| norm 0.2691 (-0.31z)| lr 5.38e-04 | 8442.19 ms | -100.0% bf16 MFU | 62103 tok/s +step 4637/19560 | loss 3.582803 (+0.10z)| norm 0.2994 (+1.11z)| lr 5.38e-04 | 8442.45 ms | -100.0% bf16 MFU | 62103 tok/s +step 4638/19560 | loss 3.558890 (-0.46z)| norm 0.2584 (-0.82z)| lr 5.38e-04 | 8442.03 ms | -100.0% bf16 MFU | 62103 tok/s +step 4639/19560 | loss 3.602361 (+0.55z)| norm 0.2702 (-0.26z)| lr 5.38e-04 | 8437.46 ms | -100.0% bf16 MFU | 62105 tok/s +step 4640/19560 | loss 3.575904 (-0.07z)| norm 0.2718 (-0.19z)| lr 5.38e-04 | 8443.54 ms | -100.0% bf16 MFU | 62105 tok/s +step 4641/19560 | loss 3.564059 (-0.35z)| norm 0.2876 (+0.57z)| lr 5.38e-04 | 8437.16 ms | -100.0% bf16 MFU | 62106 tok/s +step 4642/19560 | loss 3.534904 (-1.03z)| norm 0.2791 (+0.16z)| lr 5.38e-04 | 8437.44 ms | -100.0% bf16 MFU | 62108 tok/s +step 4643/19560 | loss 3.514426 (-1.49z)| norm 0.2965 (+0.99z)| lr 5.38e-04 | 8436.68 ms | -100.0% bf16 MFU | 62110 tok/s +step 4644/19560 | loss 3.673486 (+2.18z)| norm 0.3046 (+1.38z)| lr 5.38e-04 | 8433.21 ms | -100.0% bf16 MFU | 62113 tok/s +step 4645/19560 | loss 3.597480 (+0.43z)| norm 0.2813 (+0.28z)| lr 5.38e-04 | 8437.95 ms | -100.0% bf16 MFU | 62114 tok/s +step 4646/19560 | loss 3.576926 (-0.04z)| norm 0.2704 (-0.24z)| lr 5.38e-04 | 8434.53 ms | -100.0% bf16 MFU | 62116 tok/s +step 4647/19560 | loss 3.554241 (-0.56z)| norm 0.2581 (-0.82z)| lr 5.37e-04 | 8432.48 ms | -100.0% bf16 MFU | 62119 tok/s +step 4648/19560 | loss 3.587770 (+0.20z)| norm 0.2996 (+1.18z)| lr 5.37e-04 | 8432.70 ms | -100.0% bf16 MFU | 62122 tok/s +step 4649/19560 | loss 3.580326 (+0.02z)| norm 0.2714 (-0.18z)| lr 5.37e-04 | 8433.17 ms | -100.0% bf16 MFU | 62124 tok/s +step 4650/19560 | loss 3.560271 (-0.45z)| norm 0.2704 (-0.22z)| lr 5.37e-04 | 8435.34 ms | -100.0% bf16 MFU | 62126 tok/s +step 4651/19560 | loss 3.526016 (-1.23z)| norm 0.2550 (-0.96z)| lr 5.37e-04 | 8431.90 ms | -100.0% bf16 MFU | 62128 tok/s +step 4652/19560 | loss 3.617269 (+0.87z)| norm 0.2597 (-0.74z)| lr 5.37e-04 | 8433.50 ms | -100.0% bf16 MFU | 62130 tok/s +step 4653/19560 | loss 3.546079 (-0.77z)| norm 0.2566 (-0.90z)| lr 5.37e-04 | 8433.77 ms | -100.0% bf16 MFU | 62132 tok/s +step 4654/19560 | loss 3.541492 (-0.86z)| norm 0.2617 (-0.66z)| lr 5.37e-04 | 8431.11 ms | -100.0% bf16 MFU | 62135 tok/s +step 4655/19560 | loss 3.532813 (-1.06z)| norm 0.2721 (-0.16z)| lr 5.37e-04 | 8429.79 ms | -100.0% bf16 MFU | 62138 tok/s +step 4656/19560 | loss 3.601448 (+0.51z)| norm 0.2582 (-0.86z)| lr 5.37e-04 | 8432.94 ms | -100.0% bf16 MFU | 62139 tok/s +step 4657/19560 | loss 3.543897 (-0.81z)| norm 0.2817 (+0.29z)| lr 5.37e-04 | 8435.53 ms | -100.0% bf16 MFU | 62140 tok/s +step 4658/19560 | loss 3.589005 (+0.26z)| norm 0.2852 (+0.46z)| lr 5.37e-04 | 8432.13 ms | -100.0% bf16 MFU | 62142 tok/s +step 4659/19560 | loss 3.584924 (+0.15z)| norm 0.2746 (-0.08z)| lr 5.37e-04 | 8434.88 ms | -100.0% bf16 MFU | 62143 tok/s +step 4660/19560 | loss 3.543383 (-0.84z)| norm 0.2628 (-0.67z)| lr 5.37e-04 | 8435.56 ms | -100.0% bf16 MFU | 62143 tok/s +step 4661/19560 | loss 3.565352 (-0.31z)| norm 0.2760 (-0.02z)| lr 5.37e-04 | 8432.15 ms | -100.0% bf16 MFU | 62145 tok/s +step 4662/19560 | loss 3.582700 (+0.09z)| norm 0.2986 (+1.11z)| lr 5.37e-04 | 8434.65 ms | -100.0% bf16 MFU | 62145 tok/s +step 4663/19560 | loss 3.541248 (-0.94z)| norm 0.3321 (+2.70z)| lr 5.37e-04 | 8437.88 ms | -100.0% bf16 MFU | 62145 tok/s +step 4664/19560 | loss 3.551633 (-0.66z)| norm 0.3229 (+2.37z)| lr 5.37e-04 | 8436.09 ms | -100.0% bf16 MFU | 62145 tok/s +step 4665/19560 | loss 3.595258 (+0.48z)| norm 0.3042 (+1.43z)| lr 5.37e-04 | 8439.14 ms | -100.0% bf16 MFU | 62144 tok/s +step 4666/19560 | loss 3.504011 (-1.87z)| norm 0.2628 (-0.70z)| lr 5.37e-04 | 8437.32 ms | -100.0% bf16 MFU | 62144 tok/s +step 4667/19560 | loss 3.555152 (-0.54z)| norm 0.2718 (-0.23z)| lr 5.37e-04 | 8438.64 ms | -100.0% bf16 MFU | 62143 tok/s +step 4668/19560 | loss 3.593488 (+0.45z)| norm 0.2609 (-0.78z)| lr 5.37e-04 | 8436.45 ms | -100.0% bf16 MFU | 62143 tok/s +step 4669/19560 | loss 3.544970 (-0.80z)| norm 0.2636 (-0.64z)| lr 5.37e-04 | 8438.22 ms | -100.0% bf16 MFU | 62143 tok/s +step 4670/19560 | loss 3.540060 (-0.92z)| norm 0.2630 (-0.66z)| lr 5.37e-04 | 8440.11 ms | -100.0% bf16 MFU | 62142 tok/s +step 4671/19560 | loss 3.549639 (-0.68z)| norm 0.2675 (-0.42z)| lr 5.37e-04 | 8440.79 ms | -100.0% bf16 MFU | 62140 tok/s +step 4672/19560 | loss 3.542122 (-0.86z)| norm 0.2777 (+0.10z)| lr 5.37e-04 | 8438.55 ms | -100.0% bf16 MFU | 62140 tok/s +step 4673/19560 | loss 3.541323 (-0.88z)| norm 0.2424 (-1.72z)| lr 5.37e-04 | 8438.08 ms | -100.0% bf16 MFU | 62139 tok/s +step 4674/19560 | loss 3.582433 (+0.19z)| norm 0.2662 (-0.49z)| lr 5.37e-04 | 8439.55 ms | -100.0% bf16 MFU | 62139 tok/s +step 4675/19560 | loss 3.474582 (-2.54z)| norm 0.2810 (+0.28z)| lr 5.37e-04 | 8439.83 ms | -100.0% bf16 MFU | 62138 tok/s +step 4676/19560 | loss 3.617492 (+1.11z)| norm 0.2643 (-0.57z)| lr 5.37e-04 | 8437.68 ms | -100.0% bf16 MFU | 62138 tok/s +step 4677/19560 | loss 3.589758 (+0.39z)| norm 0.2943 (+0.97z)| lr 5.37e-04 | 8437.96 ms | -100.0% bf16 MFU | 62137 tok/s +step 4678/19560 | loss 3.608402 (+0.86z)| norm 0.2729 (-0.14z)| lr 5.37e-04 | 8439.71 ms | -100.0% bf16 MFU | 62137 tok/s +step 4679/19560 | loss 3.590013 (+0.39z)| norm 0.2678 (-0.40z)| lr 5.37e-04 | 8439.53 ms | -100.0% bf16 MFU | 62136 tok/s +step 4680/19560 | loss 3.595612 (+0.53z)| norm 0.2552 (-1.04z)| lr 5.36e-04 | 8441.44 ms | -100.0% bf16 MFU | 62135 tok/s +step 4681/19560 | loss 3.557523 (-0.44z)| norm 0.2868 (+0.59z)| lr 5.36e-04 | 8439.24 ms | -100.0% bf16 MFU | 62134 tok/s +step 4682/19560 | loss 3.604665 (+0.76z)| norm 0.2567 (-0.95z)| lr 5.36e-04 | 8441.06 ms | -100.0% bf16 MFU | 62133 tok/s +step 4683/19560 | loss 3.585673 (+0.28z)| norm 0.2545 (-1.07z)| lr 5.36e-04 | 8468.63 ms | -100.0% bf16 MFU | 62122 tok/s +step 4684/19560 | loss 3.554914 (-0.51z)| norm 0.2836 (+0.43z)| lr 5.36e-04 | 8464.00 ms | -100.0% bf16 MFU | 62113 tok/s +step 4685/19560 | loss 3.587947 (+0.33z)| norm 0.3048 (+1.49z)| lr 5.36e-04 | 8465.15 ms | -100.0% bf16 MFU | 62104 tok/s +step 4686/19560 | loss 3.536376 (-0.98z)| norm 0.3123 (+1.83z)| lr 5.36e-04 | 8462.52 ms | -100.0% bf16 MFU | 62096 tok/s +step 4687/19560 | loss 3.636603 (+1.56z)| norm 0.3039 (+1.39z)| lr 5.36e-04 | 8457.38 ms | -100.0% bf16 MFU | 62091 tok/s +step 4688/19560 | loss 3.541201 (-0.86z)| norm 0.3008 (+1.22z)| lr 5.36e-04 | 8461.23 ms | -100.0% bf16 MFU | 62085 tok/s +step 4689/19560 | loss 3.598797 (+0.59z)| norm 0.2959 (+0.96z)| lr 5.36e-04 | 8462.70 ms | -100.0% bf16 MFU | 62078 tok/s +step 4690/19560 | loss 3.570308 (-0.14z)| norm 0.3150 (+1.87z)| lr 5.36e-04 | 8462.35 ms | -100.0% bf16 MFU | 62072 tok/s +step 4691/19560 | loss 3.638314 (+1.56z)| norm 0.2692 (-0.38z)| lr 5.36e-04 | 8462.48 ms | -100.0% bf16 MFU | 62066 tok/s +step 4692/19560 | loss 3.546839 (-0.74z)| norm 0.2634 (-0.67z)| lr 5.36e-04 | 8459.24 ms | -100.0% bf16 MFU | 62062 tok/s +step 4693/19560 | loss 3.573788 (-0.04z)| norm 0.2698 (-0.36z)| lr 5.36e-04 | 8458.66 ms | -100.0% bf16 MFU | 62058 tok/s +step 4694/19560 | loss 3.596950 (+0.56z)| norm 0.2555 (-1.08z)| lr 5.36e-04 | 8460.12 ms | -100.0% bf16 MFU | 62054 tok/s +step 4695/19560 | loss 3.543236 (-0.83z)| norm 0.2641 (-0.65z)| lr 5.36e-04 | 8457.27 ms | -100.0% bf16 MFU | 62050 tok/s +step 4696/19560 | loss 3.599835 (+0.67z)| norm 0.2654 (-0.59z)| lr 5.36e-04 | 8460.10 ms | -100.0% bf16 MFU | 62047 tok/s +step 4697/19560 | loss 3.551947 (-0.61z)| norm 0.2692 (-0.40z)| lr 5.36e-04 | 8463.08 ms | -100.0% bf16 MFU | 62042 tok/s +step 4698/19560 | loss 3.604227 (+0.79z)| norm 0.2549 (-1.09z)| lr 5.36e-04 | 8456.26 ms | -100.0% bf16 MFU | 62040 tok/s +step 4699/19560 | loss 3.586642 (+0.32z)| norm 0.2716 (-0.26z)| lr 5.36e-04 | 8458.82 ms | -100.0% bf16 MFU | 62037 tok/s +step 4700/19560 | loss 3.548842 (-0.68z)| norm 0.2765 (+0.00z)| lr 5.36e-04 | 8456.83 ms | -100.0% bf16 MFU | 62035 tok/s +step 4701/19560 | loss 3.640460 (+1.74z)| norm 0.2606 (-0.79z)| lr 5.36e-04 | 8457.50 ms | -100.0% bf16 MFU | 62032 tok/s +step 4702/19560 | loss 3.617697 (+1.13z)| norm 0.2651 (-0.55z)| lr 5.36e-04 | 8460.33 ms | -100.0% bf16 MFU | 62029 tok/s +step 4703/19560 | loss 3.596251 (+0.56z)| norm 0.2723 (-0.16z)| lr 5.36e-04 | 8450.68 ms | -100.0% bf16 MFU | 62030 tok/s +step 4704/19560 | loss 3.660737 (+2.21z)| norm 0.2500 (-1.31z)| lr 5.36e-04 | 8451.90 ms | -100.0% bf16 MFU | 62030 tok/s +step 4705/19560 | loss 3.584788 (+0.25z)| norm 0.2659 (-0.46z)| lr 5.36e-04 | 8452.26 ms | -100.0% bf16 MFU | 62030 tok/s +step 4706/19560 | loss 3.591323 (+0.41z)| norm 0.2779 (+0.17z)| lr 5.36e-04 | 8448.54 ms | -100.0% bf16 MFU | 62031 tok/s +step 4707/19560 | loss 3.564055 (-0.31z)| norm 0.3060 (+1.63z)| lr 5.36e-04 | 8448.83 ms | -100.0% bf16 MFU | 62033 tok/s +step 4708/19560 | loss 3.529590 (-1.22z)| norm 0.3236 (+2.47z)| lr 5.36e-04 | 8447.27 ms | -100.0% bf16 MFU | 62034 tok/s +step 4709/19560 | loss 3.632223 (+1.54z)| norm 0.3041 (+1.49z)| lr 5.36e-04 | 8447.60 ms | -100.0% bf16 MFU | 62036 tok/s +step 4710/19560 | loss 3.629589 (+1.47z)| norm 0.2824 (+0.38z)| lr 5.36e-04 | 8449.26 ms | -100.0% bf16 MFU | 62036 tok/s +step 4711/19560 | loss 3.638586 (+1.68z)| norm 0.3007 (+1.32z)| lr 5.36e-04 | 8446.05 ms | -100.0% bf16 MFU | 62038 tok/s +step 4712/19560 | loss 3.593046 (+0.46z)| norm 0.2923 (+0.88z)| lr 5.35e-04 | 8443.84 ms | -100.0% bf16 MFU | 62041 tok/s +step 4713/19560 | loss 3.571015 (-0.13z)| norm 0.2643 (-0.56z)| lr 5.35e-04 | 8436.79 ms | -100.0% bf16 MFU | 62046 tok/s +step 4714/19560 | loss 3.595589 (+0.51z)| norm 0.3026 (+1.39z)| lr 5.35e-04 | 8443.53 ms | -100.0% bf16 MFU | 62048 tok/s +step 4715/19560 | loss 3.624136 (+1.27z)| norm 0.3014 (+1.30z)| lr 5.35e-04 | 8436.88 ms | -100.0% bf16 MFU | 62053 tok/s +step 4716/19560 | loss 3.578016 (+0.04z)| norm 0.2695 (-0.31z)| lr 5.35e-04 | 8445.14 ms | -100.0% bf16 MFU | 62055 tok/s +step 4717/19560 | loss 3.672084 (+2.47z)| norm 0.3278 (+2.57z)| lr 5.35e-04 | 8441.66 ms | -100.0% bf16 MFU | 62057 tok/s +step 4718/19560 | loss 3.517624 (-1.55z)| norm 0.3298 (+2.58z)| lr 5.35e-04 | 8436.13 ms | -100.0% bf16 MFU | 62062 tok/s +step 4719/19560 | loss 3.588005 (+0.27z)| norm 0.3176 (+1.95z)| lr 5.35e-04 | 8450.20 ms | -100.0% bf16 MFU | 62061 tok/s +step 4720/19560 | loss 3.662013 (+2.15z)| norm 0.3103 (+1.57z)| lr 5.35e-04 | 8441.54 ms | -100.0% bf16 MFU | 62063 tok/s +step 4721/19560 | loss 3.588582 (+0.28z)| norm 0.2668 (-0.50z)| lr 5.35e-04 | 8438.02 ms | -100.0% bf16 MFU | 62067 tok/s +step 4722/19560 | loss 3.507591 (-1.77z)| norm 0.2640 (-0.65z)| lr 5.35e-04 | 8443.86 ms | -100.0% bf16 MFU | 62068 tok/s +step 4723/19560 | loss 3.521167 (-1.39z)| norm 0.2783 (+0.03z)| lr 5.35e-04 | 8442.89 ms | -100.0% bf16 MFU | 62070 tok/s +step 4724/19560 | loss 3.575925 (-0.02z)| norm 0.2675 (-0.49z)| lr 5.35e-04 | 8444.52 ms | -100.0% bf16 MFU | 62070 tok/s +step 4725/19560 | loss 3.579344 (+0.07z)| norm 0.2537 (-1.16z)| lr 5.35e-04 | 8445.38 ms | -100.0% bf16 MFU | 62071 tok/s +step 4726/19560 | loss 3.560407 (-0.42z)| norm 0.2544 (-1.15z)| lr 5.35e-04 | 8437.63 ms | -100.0% bf16 MFU | 62074 tok/s +step 4727/19560 | loss 3.542664 (-0.87z)| norm 0.2667 (-0.55z)| lr 5.35e-04 | 8441.24 ms | -100.0% bf16 MFU | 62076 tok/s +step 4728/19560 | loss 3.569363 (-0.19z)| norm 0.2645 (-0.66z)| lr 5.35e-04 | 8440.32 ms | -100.0% bf16 MFU | 62078 tok/s +step 4729/19560 | loss 3.611362 (+0.92z)| norm 0.3156 (+1.84z)| lr 5.35e-04 | 8440.48 ms | -100.0% bf16 MFU | 62080 tok/s +step 4730/19560 | loss 3.584768 (+0.22z)| norm 0.3107 (+1.58z)| lr 5.35e-04 | 8440.93 ms | -100.0% bf16 MFU | 62082 tok/s +step 4731/19560 | loss 3.528868 (-1.25z)| norm 0.2646 (-0.65z)| lr 5.35e-04 | 8447.53 ms | -100.0% bf16 MFU | 62081 tok/s +step 4732/19560 | loss 3.592498 (+0.46z)| norm 0.2880 (+0.47z)| lr 5.35e-04 | 8437.51 ms | -100.0% bf16 MFU | 62083 tok/s +step 4733/19560 | loss 3.549464 (-0.71z)| norm 0.2934 (+0.72z)| lr 5.35e-04 | 8439.56 ms | -100.0% bf16 MFU | 62085 tok/s +step 4734/19560 | loss 3.617487 (+1.12z)| norm 0.2698 (-0.43z)| lr 5.35e-04 | 8441.81 ms | -100.0% bf16 MFU | 62086 tok/s +step 4735/19560 | loss 3.604981 (+0.77z)| norm 0.2471 (-1.52z)| lr 5.35e-04 | 8448.78 ms | -100.0% bf16 MFU | 62085 tok/s +step 4736/19560 | loss 3.538793 (-1.02z)| norm 0.2443 (-1.65z)| lr 5.35e-04 | 8450.54 ms | -100.0% bf16 MFU | 62083 tok/s +step 4737/19560 | loss 3.574485 (-0.05z)| norm 0.2453 (-1.57z)| lr 5.35e-04 | 8447.35 ms | -100.0% bf16 MFU | 62082 tok/s +step 4738/19560 | loss 3.555598 (-0.56z)| norm 0.2695 (-0.42z)| lr 5.35e-04 | 8446.71 ms | -100.0% bf16 MFU | 62081 tok/s +step 4739/19560 | loss 3.531411 (-1.20z)| norm 0.2509 (-1.31z)| lr 5.35e-04 | 8449.58 ms | -100.0% bf16 MFU | 62080 tok/s +step 4740/19560 | loss 3.595301 (+0.51z)| norm 0.2684 (-0.46z)| lr 5.35e-04 | 8450.13 ms | -100.0% bf16 MFU | 62078 tok/s +step 4741/19560 | loss 3.663602 (+2.30z)| norm 0.2559 (-1.05z)| lr 5.35e-04 | 8449.04 ms | -100.0% bf16 MFU | 62077 tok/s +step 4742/19560 | loss 3.568516 (-0.22z)| norm 0.2465 (-1.49z)| lr 5.35e-04 | 8447.72 ms | -100.0% bf16 MFU | 62076 tok/s +step 4743/19560 | loss 3.665336 (+2.30z)| norm 0.2601 (-0.84z)| lr 5.35e-04 | 8443.79 ms | -100.0% bf16 MFU | 62077 tok/s +step 4744/19560 | loss 3.612558 (+0.91z)| norm 0.2879 (+0.48z)| lr 5.35e-04 | 8447.66 ms | -100.0% bf16 MFU | 62076 tok/s +step 4745/19560 | loss 3.631950 (+1.40z)| norm 0.2567 (-1.00z)| lr 5.34e-04 | 8450.65 ms | -100.0% bf16 MFU | 62074 tok/s +step 4746/19560 | loss 3.572509 (-0.15z)| norm 0.2665 (-0.54z)| lr 5.34e-04 | 8447.07 ms | -100.0% bf16 MFU | 62074 tok/s +step 4747/19560 | loss 3.700813 (+3.09z)| norm 0.2604 (-0.82z)| lr 5.34e-04 | 8452.90 ms | -100.0% bf16 MFU | 62072 tok/s +step 4748/19560 | loss 3.579599 (+0.01z)| norm 0.2572 (-0.95z)| lr 5.34e-04 | 8446.42 ms | -100.0% bf16 MFU | 62072 tok/s +step 4749/19560 | loss 3.572329 (-0.17z)| norm 0.2554 (-1.02z)| lr 5.34e-04 | 8452.18 ms | -100.0% bf16 MFU | 62069 tok/s +step 4750/19560 | loss 3.516283 (-1.58z)| norm 0.2678 (-0.43z)| lr 5.34e-04 | 8448.05 ms | -100.0% bf16 MFU | 62069 tok/s +val loss 3.559233 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2769/10042 = 0.275742 +step 4751/19560 | loss 3.559191 (-0.49z)| norm 0.2631 (-0.66z)| lr 5.34e-04 | 8447.59 ms | -100.0% bf16 MFU | 62069 tok/s +step 4752/19560 | loss 3.550677 (-0.71z)| norm 0.2805 (+0.16z)| lr 5.34e-04 | 8448.52 ms | -100.0% bf16 MFU | 62068 tok/s +step 4753/19560 | loss 3.545636 (-0.82z)| norm 0.2619 (-0.71z)| lr 5.34e-04 | 8442.05 ms | -100.0% bf16 MFU | 62070 tok/s +step 4754/19560 | loss 3.566329 (-0.32z)| norm 0.2610 (-0.77z)| lr 5.34e-04 | 8448.87 ms | -100.0% bf16 MFU | 62069 tok/s +step 4755/19560 | loss 3.601836 (+0.60z)| norm 0.2668 (-0.49z)| lr 5.34e-04 | 8452.49 ms | -100.0% bf16 MFU | 62067 tok/s +step 4756/19560 | loss 3.571677 (-0.17z)| norm 0.2595 (-0.82z)| lr 5.34e-04 | 8444.94 ms | -100.0% bf16 MFU | 62068 tok/s +step 4757/19560 | loss 3.573691 (-0.12z)| norm 0.2436 (-1.56z)| lr 5.34e-04 | 8447.43 ms | -100.0% bf16 MFU | 62068 tok/s +step 4758/19560 | loss 3.694283 (+2.86z)| norm 0.3518 (+3.41z)| lr 5.34e-04 | 8451.74 ms | -100.0% bf16 MFU | 62066 tok/s +step 4759/19560 | loss 3.578724 (-0.01z)| norm 0.2541 (-1.03z)| lr 5.34e-04 | 8447.62 ms | -100.0% bf16 MFU | 62066 tok/s +step 4760/19560 | loss 3.551165 (-0.71z)| norm 0.2468 (-1.35z)| lr 5.34e-04 | 8444.35 ms | -100.0% bf16 MFU | 62067 tok/s +step 4761/19560 | loss 3.573164 (-0.15z)| norm 0.2700 (-0.28z)| lr 5.34e-04 | 8443.85 ms | -100.0% bf16 MFU | 62068 tok/s +step 4762/19560 | loss 3.589448 (+0.25z)| norm 0.2874 (+0.51z)| lr 5.34e-04 | 8438.48 ms | -100.0% bf16 MFU | 62071 tok/s +step 4763/19560 | loss 3.574489 (-0.12z)| norm 0.2860 (+0.46z)| lr 5.34e-04 | 8436.03 ms | -100.0% bf16 MFU | 62075 tok/s +step 4764/19560 | loss 3.591472 (+0.31z)| norm 0.3013 (+1.15z)| lr 5.34e-04 | 8439.57 ms | -100.0% bf16 MFU | 62078 tok/s +step 4765/19560 | loss 3.571422 (-0.20z)| norm 0.2802 (+0.19z)| lr 5.34e-04 | 8439.48 ms | -100.0% bf16 MFU | 62080 tok/s +step 4766/19560 | loss 3.628038 (+1.22z)| norm 0.2485 (-1.27z)| lr 5.34e-04 | 8439.50 ms | -100.0% bf16 MFU | 62082 tok/s +step 4767/19560 | loss 3.584431 (+0.12z)| norm 0.2842 (+0.37z)| lr 5.34e-04 | 8432.57 ms | -100.0% bf16 MFU | 62087 tok/s +step 4768/19560 | loss 3.630507 (+1.27z)| norm 0.2883 (+0.55z)| lr 5.34e-04 | 8438.40 ms | -100.0% bf16 MFU | 62089 tok/s +step 4769/19560 | loss 3.588813 (+0.22z)| norm 0.2936 (+0.79z)| lr 5.34e-04 | 8433.56 ms | -100.0% bf16 MFU | 62093 tok/s +step 4770/19560 | loss 3.559848 (-0.52z)| norm 0.2969 (+0.93z)| lr 5.34e-04 | 8434.05 ms | -100.0% bf16 MFU | 62096 tok/s +step 4771/19560 | loss 3.617749 (+0.93z)| norm 0.2701 (-0.28z)| lr 5.34e-04 | 8439.11 ms | -100.0% bf16 MFU | 62098 tok/s +step 4772/19560 | loss 3.596683 (+0.42z)| norm 0.2884 (+0.56z)| lr 5.34e-04 | 8428.82 ms | -100.0% bf16 MFU | 62103 tok/s +step 4773/19560 | loss 3.643716 (+1.61z)| norm 0.2919 (+0.72z)| lr 5.34e-04 | 8433.06 ms | -100.0% bf16 MFU | 62106 tok/s +step 4774/19560 | loss 3.539857 (-1.04z)| norm 0.2458 (-1.38z)| lr 5.34e-04 | 8430.43 ms | -100.0% bf16 MFU | 62110 tok/s +step 4775/19560 | loss 3.637038 (+1.42z)| norm 0.2541 (-1.00z)| lr 5.34e-04 | 8435.09 ms | -100.0% bf16 MFU | 62113 tok/s +step 4776/19560 | loss 3.566393 (-0.37z)| norm 0.2790 (+0.15z)| lr 5.33e-04 | 8434.02 ms | -100.0% bf16 MFU | 62115 tok/s +step 4777/19560 | loss 3.554396 (-0.67z)| norm 0.2863 (+0.48z)| lr 5.33e-04 | 8433.57 ms | -100.0% bf16 MFU | 62118 tok/s +step 4778/19560 | loss 3.590971 (+0.25z)| norm 0.2799 (+0.18z)| lr 5.33e-04 | 8436.92 ms | -100.0% bf16 MFU | 62119 tok/s +step 4779/19560 | loss 3.578096 (-0.09z)| norm 0.2802 (+0.18z)| lr 5.33e-04 | 8433.21 ms | -100.0% bf16 MFU | 62122 tok/s +step 4780/19560 | loss 3.583960 (+0.07z)| norm 0.3086 (+1.46z)| lr 5.33e-04 | 8440.55 ms | -100.0% bf16 MFU | 62121 tok/s +step 4781/19560 | loss 3.563186 (-0.47z)| norm 0.3119 (+1.59z)| lr 5.33e-04 | 8438.05 ms | -100.0% bf16 MFU | 62122 tok/s +step 4782/19560 | loss 3.576703 (-0.13z)| norm 0.2753 (-0.08z)| lr 5.33e-04 | 8433.22 ms | -100.0% bf16 MFU | 62124 tok/s +step 4783/19560 | loss 3.569082 (-0.34z)| norm 0.2680 (-0.41z)| lr 5.33e-04 | 8436.93 ms | -100.0% bf16 MFU | 62125 tok/s +step 4784/19560 | loss 3.571740 (-0.26z)| norm 0.2606 (-0.75z)| lr 5.33e-04 | 8445.38 ms | -100.0% bf16 MFU | 62123 tok/s +step 4785/19560 | loss 3.535984 (-1.19z)| norm 0.2854 (+0.38z)| lr 5.33e-04 | 8438.01 ms | -100.0% bf16 MFU | 62123 tok/s +step 4786/19560 | loss 3.581232 (-0.01z)| norm 0.2878 (+0.48z)| lr 5.33e-04 | 8439.67 ms | -100.0% bf16 MFU | 62123 tok/s +step 4787/19560 | loss 3.581588 (-0.00z)| norm 0.2664 (-0.49z)| lr 5.33e-04 | 8441.65 ms | -100.0% bf16 MFU | 62123 tok/s +step 4788/19560 | loss 3.549166 (-0.84z)| norm 0.2752 (-0.09z)| lr 5.33e-04 | 8439.72 ms | -100.0% bf16 MFU | 62123 tok/s +step 4789/19560 | loss 3.578406 (-0.09z)| norm 0.3456 (+2.98z)| lr 5.33e-04 | 8440.67 ms | -100.0% bf16 MFU | 62122 tok/s +step 4790/19560 | loss 3.583689 (+0.05z)| norm 0.2595 (-0.79z)| lr 5.33e-04 | 8443.83 ms | -100.0% bf16 MFU | 62121 tok/s +step 4791/19560 | loss 3.516179 (-1.69z)| norm 0.2687 (-0.37z)| lr 5.33e-04 | 8445.17 ms | -100.0% bf16 MFU | 62119 tok/s +step 4792/19560 | loss 3.611007 (+0.75z)| norm 0.2424 (-1.54z)| lr 5.33e-04 | 8445.03 ms | -100.0% bf16 MFU | 62117 tok/s +step 4793/19560 | loss 3.572037 (-0.25z)| norm 0.2774 (+0.06z)| lr 5.33e-04 | 8447.59 ms | -100.0% bf16 MFU | 62114 tok/s +step 4794/19560 | loss 3.541691 (-1.06z)| norm 0.2587 (-0.79z)| lr 5.33e-04 | 8439.97 ms | -100.0% bf16 MFU | 62114 tok/s +step 4795/19560 | loss 3.550877 (-0.82z)| norm 0.2539 (-1.00z)| lr 5.33e-04 | 8441.87 ms | -100.0% bf16 MFU | 62114 tok/s +step 4796/19560 | loss 3.592568 (+0.27z)| norm 0.2424 (-1.51z)| lr 5.33e-04 | 8443.36 ms | -100.0% bf16 MFU | 62113 tok/s +step 4797/19560 | loss 3.548161 (-0.89z)| norm 0.2520 (-1.06z)| lr 5.33e-04 | 8439.54 ms | -100.0% bf16 MFU | 62114 tok/s +step 4798/19560 | loss 3.600082 (+0.46z)| norm 0.2635 (-0.55z)| lr 5.33e-04 | 8443.41 ms | -100.0% bf16 MFU | 62113 tok/s +step 4799/19560 | loss 3.563995 (-0.49z)| norm 0.2806 (+0.21z)| lr 5.33e-04 | 8441.78 ms | -100.0% bf16 MFU | 62112 tok/s +step 4800/19560 | loss 3.595006 (+0.31z)| norm 0.3065 (+1.36z)| lr 5.33e-04 | 8437.47 ms | -100.0% bf16 MFU | 62114 tok/s +step 4801/19560 | loss 3.541289 (-1.11z)| norm 0.3257 (+2.16z)| lr 5.33e-04 | 8438.90 ms | -100.0% bf16 MFU | 62114 tok/s +step 4802/19560 | loss 3.588646 (+0.14z)| norm 0.2861 (+0.41z)| lr 5.33e-04 | 8442.51 ms | -100.0% bf16 MFU | 62114 tok/s +step 4803/19560 | loss 3.558705 (-0.68z)| norm 0.2691 (-0.34z)| lr 5.33e-04 | 8443.59 ms | -100.0% bf16 MFU | 62113 tok/s +step 4804/19560 | loss 3.594218 (+0.29z)| norm 0.3146 (+1.64z)| lr 5.33e-04 | 8445.53 ms | -100.0% bf16 MFU | 62111 tok/s +step 4805/19560 | loss 3.597333 (+0.37z)| norm 0.3224 (+1.94z)| lr 5.33e-04 | 8441.77 ms | -100.0% bf16 MFU | 62111 tok/s +step 4806/19560 | loss 3.526035 (-1.55z)| norm 0.2602 (-0.73z)| lr 5.33e-04 | 8440.16 ms | -100.0% bf16 MFU | 62111 tok/s +step 4807/19560 | loss 3.606122 (+0.62z)| norm 0.2882 (+0.47z)| lr 5.33e-04 | 8442.21 ms | -100.0% bf16 MFU | 62111 tok/s +step 4808/19560 | loss 3.582950 (-0.00z)| norm 0.3025 (+1.07z)| lr 5.32e-04 | 8440.34 ms | -100.0% bf16 MFU | 62111 tok/s +step 4809/19560 | loss 3.636248 (+1.41z)| norm 0.2846 (+0.29z)| lr 5.32e-04 | 8444.88 ms | -100.0% bf16 MFU | 62110 tok/s +step 4810/19560 | loss 3.536596 (-1.25z)| norm 0.2834 (+0.23z)| lr 5.32e-04 | 8439.33 ms | -100.0% bf16 MFU | 62110 tok/s +step 4811/19560 | loss 3.543012 (-1.06z)| norm 0.2673 (-0.46z)| lr 5.32e-04 | 8440.81 ms | -100.0% bf16 MFU | 62110 tok/s +step 4812/19560 | loss 3.568232 (-0.39z)| norm 0.3104 (+1.38z)| lr 5.32e-04 | 8444.95 ms | -100.0% bf16 MFU | 62109 tok/s +step 4813/19560 | loss 3.552675 (-0.80z)| norm 0.2859 (+0.34z)| lr 5.32e-04 | 8443.71 ms | -100.0% bf16 MFU | 62108 tok/s +step 4814/19560 | loss 3.554791 (-0.75z)| norm 0.2560 (-0.94z)| lr 5.32e-04 | 8440.35 ms | -100.0% bf16 MFU | 62109 tok/s +step 4815/19560 | loss 3.617054 (+0.92z)| norm 0.2733 (-0.18z)| lr 5.32e-04 | 8439.78 ms | -100.0% bf16 MFU | 62109 tok/s +step 4816/19560 | loss 3.522213 (-1.61z)| norm 0.2904 (+0.57z)| lr 5.32e-04 | 8442.31 ms | -100.0% bf16 MFU | 62109 tok/s +step 4817/19560 | loss 3.537159 (-1.19z)| norm 0.2826 (+0.23z)| lr 5.32e-04 | 8441.51 ms | -100.0% bf16 MFU | 62109 tok/s +step 4818/19560 | loss 3.607079 (+0.66z)| norm 0.3156 (+1.68z)| lr 5.32e-04 | 8442.82 ms | -100.0% bf16 MFU | 62108 tok/s +step 4819/19560 | loss 3.561022 (-0.55z)| norm 0.3034 (+1.13z)| lr 5.32e-04 | 8439.27 ms | -100.0% bf16 MFU | 62109 tok/s +step 4820/19560 | loss 3.640772 (+1.55z)| norm 0.2448 (-1.42z)| lr 5.32e-04 | 8442.30 ms | -100.0% bf16 MFU | 62109 tok/s +step 4821/19560 | loss 3.566456 (-0.42z)| norm 0.2789 (+0.06z)| lr 5.32e-04 | 8439.67 ms | -100.0% bf16 MFU | 62110 tok/s +step 4822/19560 | loss 3.609108 (+0.71z)| norm 0.2791 (+0.06z)| lr 5.32e-04 | 8443.82 ms | -100.0% bf16 MFU | 62109 tok/s +step 4823/19560 | loss 3.536901 (-1.21z)| norm 0.2498 (-1.20z)| lr 5.32e-04 | 8439.10 ms | -100.0% bf16 MFU | 62109 tok/s +step 4824/19560 | loss 3.555337 (-0.71z)| norm 0.2696 (-0.35z)| lr 5.32e-04 | 8442.08 ms | -100.0% bf16 MFU | 62109 tok/s +step 4825/19560 | loss 3.511461 (-1.84z)| norm 0.2764 (-0.05z)| lr 5.32e-04 | 8439.95 ms | -100.0% bf16 MFU | 62110 tok/s +step 4826/19560 | loss 3.501871 (-2.04z)| norm 0.2523 (-1.10z)| lr 5.32e-04 | 8441.50 ms | -100.0% bf16 MFU | 62110 tok/s +step 4827/19560 | loss 3.587161 (+0.16z)| norm 0.2671 (-0.46z)| lr 5.32e-04 | 8444.38 ms | -100.0% bf16 MFU | 62109 tok/s +step 4828/19560 | loss 3.570257 (-0.28z)| norm 0.2937 (+0.69z)| lr 5.32e-04 | 8439.77 ms | -100.0% bf16 MFU | 62109 tok/s +step 4829/19560 | loss 3.568376 (-0.32z)| norm 0.3062 (+1.21z)| lr 5.32e-04 | 8440.81 ms | -100.0% bf16 MFU | 62109 tok/s +step 4830/19560 | loss 3.543224 (-0.96z)| norm 0.2694 (-0.38z)| lr 5.32e-04 | 8442.56 ms | -100.0% bf16 MFU | 62109 tok/s +step 4831/19560 | loss 3.576316 (-0.09z)| norm 0.2605 (-0.75z)| lr 5.32e-04 | 8441.59 ms | -100.0% bf16 MFU | 62109 tok/s +step 4832/19560 | loss 3.565499 (-0.36z)| norm 0.2716 (-0.29z)| lr 5.32e-04 | 8441.38 ms | -100.0% bf16 MFU | 62109 tok/s +step 4833/19560 | loss 3.560318 (-0.49z)| norm 0.2677 (-0.45z)| lr 5.32e-04 | 8438.44 ms | -100.0% bf16 MFU | 62110 tok/s +step 4834/19560 | loss 3.563405 (-0.41z)| norm 0.2740 (-0.18z)| lr 5.32e-04 | 8441.13 ms | -100.0% bf16 MFU | 62110 tok/s +step 4835/19560 | loss 3.623055 (+1.16z)| norm 0.2839 (+0.26z)| lr 5.32e-04 | 8438.87 ms | -100.0% bf16 MFU | 62111 tok/s +step 4836/19560 | loss 3.558927 (-0.54z)| norm 0.2737 (-0.17z)| lr 5.32e-04 | 8441.46 ms | -100.0% bf16 MFU | 62111 tok/s +step 4837/19560 | loss 3.579534 (+0.01z)| norm 0.2650 (-0.54z)| lr 5.32e-04 | 8441.36 ms | -100.0% bf16 MFU | 62111 tok/s +step 4838/19560 | loss 3.568410 (-0.27z)| norm 0.2860 (+0.39z)| lr 5.32e-04 | 8448.37 ms | -100.0% bf16 MFU | 62108 tok/s +step 4839/19560 | loss 3.541413 (-0.99z)| norm 0.2582 (-0.83z)| lr 5.32e-04 | 8440.18 ms | -100.0% bf16 MFU | 62109 tok/s +step 4840/19560 | loss 3.544696 (-0.88z)| norm 0.2650 (-0.52z)| lr 5.31e-04 | 8437.68 ms | -100.0% bf16 MFU | 62110 tok/s +step 4841/19560 | loss 3.462898 (-2.97z)| norm 0.3146 (+1.65z)| lr 5.31e-04 | 8441.60 ms | -100.0% bf16 MFU | 62110 tok/s +step 4842/19560 | loss 3.623320 (+1.21z)| norm 0.2615 (-0.68z)| lr 5.31e-04 | 8441.37 ms | -100.0% bf16 MFU | 62110 tok/s +step 4843/19560 | loss 3.533184 (-1.11z)| norm 0.2936 (+0.75z)| lr 5.31e-04 | 8440.54 ms | -100.0% bf16 MFU | 62110 tok/s +step 4844/19560 | loss 3.609575 (+0.86z)| norm 0.2989 (+0.96z)| lr 5.31e-04 | 8443.18 ms | -100.0% bf16 MFU | 62109 tok/s +step 4845/19560 | loss 3.582060 (+0.17z)| norm 0.2783 (+0.08z)| lr 5.31e-04 | 8441.69 ms | -100.0% bf16 MFU | 62109 tok/s +step 4846/19560 | loss 3.597157 (+0.56z)| norm 0.2815 (+0.24z)| lr 5.31e-04 | 8436.39 ms | -100.0% bf16 MFU | 62111 tok/s +step 4847/19560 | loss 3.565061 (-0.29z)| norm 0.3019 (+1.20z)| lr 5.31e-04 | 8442.99 ms | -100.0% bf16 MFU | 62110 tok/s +step 4848/19560 | loss 3.520882 (-1.47z)| norm 0.2729 (-0.14z)| lr 5.31e-04 | 8437.23 ms | -100.0% bf16 MFU | 62112 tok/s +step 4849/19560 | loss 3.538917 (-0.96z)| norm 0.2660 (-0.46z)| lr 5.31e-04 | 8438.31 ms | -100.0% bf16 MFU | 62113 tok/s +step 4850/19560 | loss 3.532040 (-1.16z)| norm 0.3001 (+1.12z)| lr 5.31e-04 | 8436.85 ms | -100.0% bf16 MFU | 62114 tok/s +step 4851/19560 | loss 3.553363 (-0.59z)| norm 0.2724 (-0.17z)| lr 5.31e-04 | 8434.91 ms | -100.0% bf16 MFU | 62117 tok/s +step 4852/19560 | loss 3.570858 (-0.11z)| norm 0.3141 (+1.74z)| lr 5.31e-04 | 8430.67 ms | -100.0% bf16 MFU | 62120 tok/s +step 4853/19560 | loss 3.554317 (-0.56z)| norm 0.2871 (+0.48z)| lr 5.31e-04 | 8431.54 ms | -100.0% bf16 MFU | 62123 tok/s +step 4854/19560 | loss 3.651973 (+2.07z)| norm 0.2882 (+0.52z)| lr 5.31e-04 | 8432.41 ms | -100.0% bf16 MFU | 62126 tok/s +step 4855/19560 | loss 3.493534 (-2.17z)| norm 0.3357 (+2.64z)| lr 5.31e-04 | 8431.53 ms | -100.0% bf16 MFU | 62129 tok/s +step 4856/19560 | loss 3.576677 (+0.04z)| norm 0.2858 (+0.37z)| lr 5.31e-04 | 8430.05 ms | -100.0% bf16 MFU | 62132 tok/s +step 4857/19560 | loss 3.556983 (-0.47z)| norm 0.2861 (+0.40z)| lr 5.31e-04 | 8432.50 ms | -100.0% bf16 MFU | 62134 tok/s +step 4858/19560 | loss 3.578476 (+0.10z)| norm 0.2591 (-0.83z)| lr 5.31e-04 | 8428.60 ms | -100.0% bf16 MFU | 62137 tok/s +step 4859/19560 | loss 3.599588 (+0.65z)| norm 0.2689 (-0.38z)| lr 5.31e-04 | 8431.89 ms | -100.0% bf16 MFU | 62140 tok/s +step 4860/19560 | loss 3.572730 (-0.06z)| norm 0.2989 (+1.00z)| lr 5.31e-04 | 8432.02 ms | -100.0% bf16 MFU | 62141 tok/s +step 4861/19560 | loss 3.547013 (-0.75z)| norm 0.2757 (-0.06z)| lr 5.31e-04 | 8431.84 ms | -100.0% bf16 MFU | 62143 tok/s +step 4862/19560 | loss 3.579358 (+0.12z)| norm 0.2627 (-0.66z)| lr 5.31e-04 | 8429.68 ms | -100.0% bf16 MFU | 62146 tok/s +step 4863/19560 | loss 3.550435 (-0.64z)| norm 0.2737 (-0.16z)| lr 5.31e-04 | 8430.06 ms | -100.0% bf16 MFU | 62148 tok/s +step 4864/19560 | loss 3.592582 (+0.48z)| norm 0.2485 (-1.34z)| lr 5.31e-04 | 8432.87 ms | -100.0% bf16 MFU | 62149 tok/s +step 4865/19560 | loss 3.568131 (-0.18z)| norm 0.2355 (-1.92z)| lr 5.31e-04 | 8432.66 ms | -100.0% bf16 MFU | 62151 tok/s +step 4866/19560 | loss 3.589251 (+0.39z)| norm 0.2649 (-0.56z)| lr 5.31e-04 | 8432.88 ms | -100.0% bf16 MFU | 62152 tok/s +step 4867/19560 | loss 3.587146 (+0.32z)| norm 0.2552 (-1.01z)| lr 5.31e-04 | 8433.07 ms | -100.0% bf16 MFU | 62153 tok/s +step 4868/19560 | loss 3.530654 (-1.20z)| norm 0.2496 (-1.26z)| lr 5.31e-04 | 8436.44 ms | -100.0% bf16 MFU | 62152 tok/s +step 4869/19560 | loss 3.561912 (-0.34z)| norm 0.2589 (-0.84z)| lr 5.31e-04 | 8433.85 ms | -100.0% bf16 MFU | 62153 tok/s +step 4870/19560 | loss 3.680494 (+2.83z)| norm 0.2480 (-1.34z)| lr 5.31e-04 | 8435.24 ms | -100.0% bf16 MFU | 62153 tok/s +step 4871/19560 | loss 3.598541 (+0.66z)| norm 0.2636 (-0.62z)| lr 5.30e-04 | 8434.65 ms | -100.0% bf16 MFU | 62153 tok/s +step 4872/19560 | loss 3.526648 (-1.29z)| norm 0.2570 (-0.91z)| lr 5.30e-04 | 8435.49 ms | -100.0% bf16 MFU | 62153 tok/s +step 4873/19560 | loss 3.581252 (+0.22z)| norm 0.2623 (-0.67z)| lr 5.30e-04 | 8443.70 ms | -100.0% bf16 MFU | 62150 tok/s +step 4874/19560 | loss 3.550009 (-0.64z)| norm 0.2707 (-0.28z)| lr 5.30e-04 | 8461.43 ms | -100.0% bf16 MFU | 62141 tok/s +step 4875/19560 | loss 3.633763 (+1.75z)| norm 0.2640 (-0.60z)| lr 5.30e-04 | 8462.59 ms | -100.0% bf16 MFU | 62131 tok/s +step 4876/19560 | loss 3.546143 (-0.75z)| norm 0.2756 (-0.07z)| lr 5.30e-04 | 8464.53 ms | -100.0% bf16 MFU | 62122 tok/s +step 4877/19560 | loss 3.597004 (+0.70z)| norm 0.2575 (-0.91z)| lr 5.30e-04 | 8460.62 ms | -100.0% bf16 MFU | 62114 tok/s +step 4878/19560 | loss 3.602217 (+0.83z)| norm 0.2657 (-0.53z)| lr 5.30e-04 | 8462.40 ms | -100.0% bf16 MFU | 62106 tok/s +step 4879/19560 | loss 3.545489 (-0.79z)| norm 0.2800 (+0.13z)| lr 5.30e-04 | 8462.25 ms | -100.0% bf16 MFU | 62099 tok/s +step 4880/19560 | loss 3.605713 (+0.92z)| norm 0.2708 (-0.29z)| lr 5.30e-04 | 8464.21 ms | -100.0% bf16 MFU | 62091 tok/s +step 4881/19560 | loss 3.561080 (-0.36z)| norm 0.2658 (-0.53z)| lr 5.30e-04 | 8460.72 ms | -100.0% bf16 MFU | 62085 tok/s +step 4882/19560 | loss 3.540469 (-0.94z)| norm 0.2907 (+0.62z)| lr 5.30e-04 | 8461.06 ms | -100.0% bf16 MFU | 62079 tok/s +step 4883/19560 | loss 3.623735 (+1.42z)| norm 0.3163 (+1.78z)| lr 5.30e-04 | 8462.36 ms | -100.0% bf16 MFU | 62073 tok/s +step 4884/19560 | loss 3.562190 (-0.32z)| norm 0.2904 (+0.57z)| lr 5.30e-04 | 8461.91 ms | -100.0% bf16 MFU | 62067 tok/s +step 4885/19560 | loss 3.578481 (+0.14z)| norm 0.2532 (-1.15z)| lr 5.30e-04 | 8456.11 ms | -100.0% bf16 MFU | 62064 tok/s +step 4886/19560 | loss 3.536477 (-1.07z)| norm 0.2538 (-1.14z)| lr 5.30e-04 | 8456.00 ms | -100.0% bf16 MFU | 62060 tok/s +step 4887/19560 | loss 3.570374 (-0.06z)| norm 0.2509 (-1.27z)| lr 5.30e-04 | 8457.15 ms | -100.0% bf16 MFU | 62057 tok/s +step 4888/19560 | loss 3.624307 (+1.52z)| norm 0.2423 (-1.68z)| lr 5.30e-04 | 8457.21 ms | -100.0% bf16 MFU | 62054 tok/s +step 4889/19560 | loss 3.573728 (+0.02z)| norm 0.2495 (-1.32z)| lr 5.30e-04 | 8459.51 ms | -100.0% bf16 MFU | 62050 tok/s +step 4890/19560 | loss 3.573234 (+0.01z)| norm 0.2654 (-0.55z)| lr 5.30e-04 | 8457.63 ms | -100.0% bf16 MFU | 62047 tok/s +step 4891/19560 | loss 3.547464 (-0.74z)| norm 0.2591 (-0.84z)| lr 5.30e-04 | 8460.18 ms | -100.0% bf16 MFU | 62043 tok/s +step 4892/19560 | loss 3.621336 (+1.42z)| norm 0.2711 (-0.26z)| lr 5.30e-04 | 8455.80 ms | -100.0% bf16 MFU | 62041 tok/s +step 4893/19560 | loss 3.590489 (+0.51z)| norm 0.2806 (+0.20z)| lr 5.30e-04 | 8457.06 ms | -100.0% bf16 MFU | 62039 tok/s +step 4894/19560 | loss 3.586806 (+0.42z)| norm 0.3257 (+2.30z)| lr 5.30e-04 | 8459.41 ms | -100.0% bf16 MFU | 62036 tok/s +step 4895/19560 | loss 3.587298 (+0.43z)| norm 0.3623 (+3.77z)| lr 5.30e-04 | 8461.43 ms | -100.0% bf16 MFU | 62032 tok/s +step 4896/19560 | loss 3.558515 (-0.41z)| norm 0.3138 (+1.59z)| lr 5.30e-04 | 8456.41 ms | -100.0% bf16 MFU | 62030 tok/s +step 4897/19560 | loss 3.518202 (-1.58z)| norm 0.3056 (+1.21z)| lr 5.30e-04 | 8458.16 ms | -100.0% bf16 MFU | 62028 tok/s +step 4898/19560 | loss 3.629384 (+1.68z)| norm 0.3202 (+1.83z)| lr 5.30e-04 | 8449.96 ms | -100.0% bf16 MFU | 62029 tok/s +step 4899/19560 | loss 3.596301 (+0.72z)| norm 0.3313 (+2.25z)| lr 5.30e-04 | 8461.68 ms | -100.0% bf16 MFU | 62026 tok/s +step 4900/19560 | loss 3.599133 (+0.80z)| norm 0.2898 (+0.47z)| lr 5.30e-04 | 8459.68 ms | -100.0% bf16 MFU | 62023 tok/s +step 4901/19560 | loss 3.636338 (+1.90z)| norm 0.2651 (-0.57z)| lr 5.30e-04 | 8452.50 ms | -100.0% bf16 MFU | 62023 tok/s +step 4902/19560 | loss 3.537022 (-1.03z)| norm 0.3019 (+0.99z)| lr 5.29e-04 | 8453.31 ms | -100.0% bf16 MFU | 62023 tok/s +step 4903/19560 | loss 3.570665 (-0.02z)| norm 0.2629 (-0.69z)| lr 5.29e-04 | 8459.28 ms | -100.0% bf16 MFU | 62021 tok/s +step 4904/19560 | loss 3.593278 (+0.65z)| norm 0.2667 (-0.52z)| lr 5.29e-04 | 8456.21 ms | -100.0% bf16 MFU | 62020 tok/s +step 4905/19560 | loss 3.568631 (-0.09z)| norm 0.2493 (-1.25z)| lr 5.29e-04 | 8454.34 ms | -100.0% bf16 MFU | 62020 tok/s +step 4906/19560 | loss 3.586461 (+0.44z)| norm 0.2721 (-0.27z)| lr 5.29e-04 | 8454.18 ms | -100.0% bf16 MFU | 62019 tok/s +step 4907/19560 | loss 3.552112 (-0.58z)| norm 0.2638 (-0.62z)| lr 5.29e-04 | 8447.19 ms | -100.0% bf16 MFU | 62022 tok/s +step 4908/19560 | loss 3.562349 (-0.27z)| norm 0.2589 (-0.82z)| lr 5.29e-04 | 8451.27 ms | -100.0% bf16 MFU | 62023 tok/s +step 4909/19560 | loss 3.574062 (+0.08z)| norm 0.2858 (+0.35z)| lr 5.29e-04 | 8450.21 ms | -100.0% bf16 MFU | 62024 tok/s +step 4910/19560 | loss 3.727380 (+4.28z)| norm 0.2876 (+0.42z)| lr 5.29e-04 | 8450.70 ms | -100.0% bf16 MFU | 62024 tok/s +step 4911/19560 | loss 3.587057 (+0.40z)| norm 0.2858 (+0.33z)| lr 5.29e-04 | 8446.36 ms | -100.0% bf16 MFU | 62027 tok/s +step 4912/19560 | loss 3.537097 (-0.97z)| norm 0.3074 (+1.24z)| lr 5.29e-04 | 8454.13 ms | -100.0% bf16 MFU | 62026 tok/s +step 4913/19560 | loss 3.623540 (+1.38z)| norm 0.2946 (+0.69z)| lr 5.29e-04 | 8450.85 ms | -100.0% bf16 MFU | 62027 tok/s +step 4914/19560 | loss 3.556253 (-0.46z)| norm 0.2951 (+0.71z)| lr 5.29e-04 | 8448.61 ms | -100.0% bf16 MFU | 62028 tok/s +step 4915/19560 | loss 3.587330 (+0.40z)| norm 0.2641 (-0.62z)| lr 5.29e-04 | 8447.51 ms | -100.0% bf16 MFU | 62030 tok/s +step 4916/19560 | loss 3.605844 (+0.89z)| norm 0.2663 (-0.52z)| lr 5.29e-04 | 8452.11 ms | -100.0% bf16 MFU | 62030 tok/s +step 4917/19560 | loss 3.594267 (+0.57z)| norm 0.2603 (-0.77z)| lr 5.29e-04 | 8449.21 ms | -100.0% bf16 MFU | 62031 tok/s +step 4918/19560 | loss 3.677151 (+2.73z)| norm 0.2723 (-0.24z)| lr 5.29e-04 | 8456.54 ms | -100.0% bf16 MFU | 62030 tok/s +step 4919/19560 | loss 3.568953 (-0.15z)| norm 0.2704 (-0.33z)| lr 5.29e-04 | 8455.45 ms | -100.0% bf16 MFU | 62028 tok/s +step 4920/19560 | loss 3.611464 (+0.99z)| norm 0.2825 (+0.19z)| lr 5.29e-04 | 8452.70 ms | -100.0% bf16 MFU | 62028 tok/s +step 4921/19560 | loss 3.580597 (+0.16z)| norm 0.2702 (-0.35z)| lr 5.29e-04 | 8448.03 ms | -100.0% bf16 MFU | 62030 tok/s +step 4922/19560 | loss 3.555903 (-0.51z)| norm 0.2698 (-0.37z)| lr 5.29e-04 | 8453.46 ms | -100.0% bf16 MFU | 62029 tok/s +step 4923/19560 | loss 3.573650 (-0.04z)| norm 0.2450 (-1.47z)| lr 5.29e-04 | 8451.58 ms | -100.0% bf16 MFU | 62030 tok/s +step 4924/19560 | loss 3.540218 (-0.92z)| norm 0.2646 (-0.61z)| lr 5.29e-04 | 8449.07 ms | -100.0% bf16 MFU | 62031 tok/s +step 4925/19560 | loss 3.573623 (-0.03z)| norm 0.2545 (-1.07z)| lr 5.29e-04 | 8444.30 ms | -100.0% bf16 MFU | 62034 tok/s +step 4926/19560 | loss 3.527907 (-1.24z)| norm 0.2672 (-0.50z)| lr 5.29e-04 | 8446.26 ms | -100.0% bf16 MFU | 62036 tok/s +step 4927/19560 | loss 3.559959 (-0.38z)| norm 0.2600 (-0.82z)| lr 5.29e-04 | 8439.26 ms | -100.0% bf16 MFU | 62040 tok/s +step 4928/19560 | loss 3.576211 (+0.06z)| norm 0.2710 (-0.31z)| lr 5.29e-04 | 8441.94 ms | -100.0% bf16 MFU | 62043 tok/s +step 4929/19560 | loss 3.601544 (+0.72z)| norm 0.2598 (-0.81z)| lr 5.29e-04 | 8439.29 ms | -100.0% bf16 MFU | 62047 tok/s +step 4930/19560 | loss 3.538211 (-0.96z)| norm 0.2584 (-0.86z)| lr 5.29e-04 | 8435.69 ms | -100.0% bf16 MFU | 62053 tok/s +step 4931/19560 | loss 3.637348 (+1.66z)| norm 0.2843 (+0.32z)| lr 5.29e-04 | 8438.94 ms | -100.0% bf16 MFU | 62056 tok/s +step 4932/19560 | loss 3.675654 (+2.58z)| norm 0.2823 (+0.24z)| lr 5.29e-04 | 8431.02 ms | -100.0% bf16 MFU | 62063 tok/s +step 4933/19560 | loss 3.586241 (+0.28z)| norm 0.2787 (+0.09z)| lr 5.28e-04 | 8427.30 ms | -100.0% bf16 MFU | 62070 tok/s +step 4934/19560 | loss 3.568690 (-0.18z)| norm 0.2746 (-0.10z)| lr 5.28e-04 | 8442.16 ms | -100.0% bf16 MFU | 62072 tok/s +step 4935/19560 | loss 3.537621 (-0.97z)| norm 0.2781 (+0.07z)| lr 5.28e-04 | 8437.70 ms | -100.0% bf16 MFU | 62075 tok/s +step 4936/19560 | loss 3.543573 (-0.81z)| norm 0.3068 (+1.42z)| lr 5.28e-04 | 8437.84 ms | -100.0% bf16 MFU | 62078 tok/s +step 4937/19560 | loss 3.705647 (+3.26z)| norm 0.2612 (-0.72z)| lr 5.28e-04 | 8434.55 ms | -100.0% bf16 MFU | 62082 tok/s +step 4938/19560 | loss 3.570160 (-0.14z)| norm 0.2734 (-0.15z)| lr 5.28e-04 | 8440.08 ms | -100.0% bf16 MFU | 62084 tok/s +step 4939/19560 | loss 3.610998 (+0.88z)| norm 0.2768 (+0.01z)| lr 5.28e-04 | 8439.72 ms | -100.0% bf16 MFU | 62086 tok/s +step 4940/19560 | loss 3.630317 (+1.34z)| norm 0.2786 (+0.11z)| lr 5.28e-04 | 8438.75 ms | -100.0% bf16 MFU | 62088 tok/s +step 4941/19560 | loss 3.588511 (+0.29z)| norm 0.2634 (-0.61z)| lr 5.28e-04 | 8443.55 ms | -100.0% bf16 MFU | 62088 tok/s +step 4942/19560 | loss 3.615881 (+0.96z)| norm 0.2480 (-1.33z)| lr 5.28e-04 | 8433.31 ms | -100.0% bf16 MFU | 62092 tok/s +step 4943/19560 | loss 3.603122 (+0.65z)| norm 0.2954 (+0.90z)| lr 5.28e-04 | 8441.56 ms | -100.0% bf16 MFU | 62093 tok/s +step 4944/19560 | loss 3.531853 (-1.14z)| norm 0.2686 (-0.35z)| lr 5.28e-04 | 8441.31 ms | -100.0% bf16 MFU | 62094 tok/s +step 4945/19560 | loss 3.593331 (+0.39z)| norm 0.2852 (+0.43z)| lr 5.28e-04 | 8437.32 ms | -100.0% bf16 MFU | 62096 tok/s +step 4946/19560 | loss 3.581103 (+0.09z)| norm 0.2759 (+0.00z)| lr 5.28e-04 | 8439.16 ms | -100.0% bf16 MFU | 62098 tok/s +step 4947/19560 | loss 3.547994 (-0.74z)| norm 0.2509 (-1.18z)| lr 5.28e-04 | 8445.36 ms | -100.0% bf16 MFU | 62097 tok/s +step 4948/19560 | loss 3.554016 (-0.58z)| norm 0.2808 (+0.25z)| lr 5.28e-04 | 8445.58 ms | -100.0% bf16 MFU | 62096 tok/s +step 4949/19560 | loss 3.514045 (-1.56z)| norm 0.2505 (-1.20z)| lr 5.28e-04 | 8443.34 ms | -100.0% bf16 MFU | 62096 tok/s +step 4950/19560 | loss 3.573453 (-0.07z)| norm 0.2697 (-0.28z)| lr 5.28e-04 | 8444.56 ms | -100.0% bf16 MFU | 62095 tok/s +step 4951/19560 | loss 3.554487 (-0.55z)| norm 0.2475 (-1.34z)| lr 5.28e-04 | 8445.89 ms | -100.0% bf16 MFU | 62094 tok/s +step 4952/19560 | loss 3.551274 (-0.63z)| norm 0.2643 (-0.53z)| lr 5.28e-04 | 8438.53 ms | -100.0% bf16 MFU | 62096 tok/s +step 4953/19560 | loss 3.628982 (+1.31z)| norm 0.2895 (+0.67z)| lr 5.28e-04 | 8445.32 ms | -100.0% bf16 MFU | 62095 tok/s +step 4954/19560 | loss 3.616357 (+0.98z)| norm 0.3301 (+2.54z)| lr 5.28e-04 | 8444.49 ms | -100.0% bf16 MFU | 62095 tok/s +step 4955/19560 | loss 3.547208 (-0.78z)| norm 0.3156 (+1.82z)| lr 5.28e-04 | 8439.70 ms | -100.0% bf16 MFU | 62096 tok/s +step 4956/19560 | loss 3.532797 (-1.13z)| norm 0.2914 (+0.69z)| lr 5.28e-04 | 8446.25 ms | -100.0% bf16 MFU | 62095 tok/s +step 4957/19560 | loss 3.548132 (-0.74z)| norm 0.3215 (+2.07z)| lr 5.28e-04 | 8443.09 ms | -100.0% bf16 MFU | 62095 tok/s +step 4958/19560 | loss 3.538264 (-0.98z)| norm 0.3143 (+1.70z)| lr 5.28e-04 | 8441.96 ms | -100.0% bf16 MFU | 62096 tok/s +step 4959/19560 | loss 3.640202 (+1.57z)| norm 0.3070 (+1.35z)| lr 5.28e-04 | 8445.02 ms | -100.0% bf16 MFU | 62095 tok/s +step 4960/19560 | loss 3.602929 (+0.62z)| norm 0.2990 (+0.97z)| lr 5.28e-04 | 8445.27 ms | -100.0% bf16 MFU | 62094 tok/s +step 4961/19560 | loss 3.564735 (-0.33z)| norm 0.2852 (+0.34z)| lr 5.28e-04 | 8447.69 ms | -100.0% bf16 MFU | 62093 tok/s +step 4962/19560 | loss 3.635678 (+1.42z)| norm 0.2820 (+0.20z)| lr 5.28e-04 | 8446.72 ms | -100.0% bf16 MFU | 62092 tok/s +step 4963/19560 | loss 3.566396 (-0.29z)| norm 0.2564 (-0.94z)| lr 5.28e-04 | 8439.88 ms | -100.0% bf16 MFU | 62093 tok/s +step 4964/19560 | loss 3.618984 (+1.00z)| norm 0.2828 (+0.24z)| lr 5.27e-04 | 8445.66 ms | -100.0% bf16 MFU | 62092 tok/s +step 4965/19560 | loss 3.542874 (-0.88z)| norm 0.2456 (-1.42z)| lr 5.27e-04 | 8441.37 ms | -100.0% bf16 MFU | 62093 tok/s +step 4966/19560 | loss 3.577454 (-0.02z)| norm 0.2433 (-1.49z)| lr 5.27e-04 | 8440.40 ms | -100.0% bf16 MFU | 62094 tok/s +step 4967/19560 | loss 3.620460 (+1.03z)| norm 0.2864 (+0.41z)| lr 5.27e-04 | 8441.37 ms | -100.0% bf16 MFU | 62095 tok/s +step 4968/19560 | loss 3.608029 (+0.71z)| norm 0.2972 (+0.88z)| lr 5.27e-04 | 8449.06 ms | -100.0% bf16 MFU | 62093 tok/s +step 4969/19560 | loss 3.754074 (+4.12z)| norm 0.2909 (+0.61z)| lr 5.27e-04 | 8441.67 ms | -100.0% bf16 MFU | 62094 tok/s +step 4970/19560 | loss 3.567945 (-0.32z)| norm 0.2827 (+0.23z)| lr 5.27e-04 | 8443.98 ms | -100.0% bf16 MFU | 62093 tok/s +step 4971/19560 | loss 3.570248 (-0.27z)| norm 0.2905 (+0.58z)| lr 5.27e-04 | 8443.11 ms | -100.0% bf16 MFU | 62094 tok/s +step 4972/19560 | loss 3.630455 (+1.17z)| norm 0.3025 (+1.12z)| lr 5.27e-04 | 8441.21 ms | -100.0% bf16 MFU | 62094 tok/s +step 4973/19560 | loss 3.514682 (-1.58z)| norm 0.3272 (+2.17z)| lr 5.27e-04 | 8435.62 ms | -100.0% bf16 MFU | 62097 tok/s +step 4974/19560 | loss 3.542284 (-0.91z)| norm 0.2799 (+0.09z)| lr 5.27e-04 | 8437.17 ms | -100.0% bf16 MFU | 62099 tok/s +step 4975/19560 | loss 3.566659 (-0.34z)| norm 0.2796 (+0.08z)| lr 5.27e-04 | 8441.67 ms | -100.0% bf16 MFU | 62100 tok/s +step 4976/19560 | loss 3.544768 (-0.86z)| norm 0.3171 (+1.71z)| lr 5.27e-04 | 8434.68 ms | -100.0% bf16 MFU | 62103 tok/s +step 4977/19560 | loss 3.583834 (+0.06z)| norm 0.2767 (-0.06z)| lr 5.27e-04 | 8434.14 ms | -100.0% bf16 MFU | 62106 tok/s +step 4978/19560 | loss 3.569144 (-0.30z)| norm 0.2813 (+0.15z)| lr 5.27e-04 | 8438.71 ms | -100.0% bf16 MFU | 62107 tok/s +step 4979/19560 | loss 3.546932 (-0.83z)| norm 0.2449 (-1.43z)| lr 5.27e-04 | 8437.13 ms | -100.0% bf16 MFU | 62109 tok/s +step 4980/19560 | loss 3.479981 (-2.37z)| norm 0.2550 (-0.97z)| lr 5.27e-04 | 8431.64 ms | -100.0% bf16 MFU | 62112 tok/s +step 4981/19560 | loss 3.524369 (-1.32z)| norm 0.2502 (-1.17z)| lr 5.27e-04 | 8434.31 ms | -100.0% bf16 MFU | 62115 tok/s +step 4982/19560 | loss 3.527274 (-1.23z)| norm 0.2601 (-0.72z)| lr 5.27e-04 | 8433.88 ms | -100.0% bf16 MFU | 62117 tok/s +step 4983/19560 | loss 3.535580 (-1.06z)| norm 0.2550 (-0.94z)| lr 5.27e-04 | 8436.25 ms | -100.0% bf16 MFU | 62119 tok/s +step 4984/19560 | loss 3.599433 (+0.46z)| norm 0.2662 (-0.44z)| lr 5.27e-04 | 8436.27 ms | -100.0% bf16 MFU | 62120 tok/s +step 4985/19560 | loss 3.633665 (+1.25z)| norm 0.2845 (+0.38z)| lr 5.27e-04 | 8433.76 ms | -100.0% bf16 MFU | 62122 tok/s +step 4986/19560 | loss 3.505302 (-1.75z)| norm 0.3113 (+1.55z)| lr 5.27e-04 | 8431.33 ms | -100.0% bf16 MFU | 62125 tok/s +step 4987/19560 | loss 3.565014 (-0.35z)| norm 0.3017 (+1.10z)| lr 5.27e-04 | 8436.00 ms | -100.0% bf16 MFU | 62127 tok/s +step 4988/19560 | loss 3.556625 (-0.54z)| norm 0.2672 (-0.41z)| lr 5.27e-04 | 8434.92 ms | -100.0% bf16 MFU | 62128 tok/s +step 4989/19560 | loss 3.584432 (+0.10z)| norm 0.2869 (+0.46z)| lr 5.27e-04 | 8436.75 ms | -100.0% bf16 MFU | 62129 tok/s +step 4990/19560 | loss 3.563570 (-0.38z)| norm 0.3088 (+1.40z)| lr 5.27e-04 | 8435.96 ms | -100.0% bf16 MFU | 62130 tok/s +step 4991/19560 | loss 3.551177 (-0.67z)| norm 0.2602 (-0.72z)| lr 5.27e-04 | 8432.94 ms | -100.0% bf16 MFU | 62132 tok/s +step 4992/19560 | loss 3.606781 (+0.62z)| norm 0.2697 (-0.32z)| lr 5.27e-04 | 8435.83 ms | -100.0% bf16 MFU | 62133 tok/s +step 4993/19560 | loss 3.575288 (-0.11z)| norm 0.2868 (+0.42z)| lr 5.27e-04 | 8438.61 ms | -100.0% bf16 MFU | 62133 tok/s +step 4994/19560 | loss 3.581984 (+0.04z)| norm 0.2962 (+0.83z)| lr 5.27e-04 | 8437.84 ms | -100.0% bf16 MFU | 62133 tok/s +step 4995/19560 | loss 3.544579 (-0.82z)| norm 0.3054 (+1.22z)| lr 5.26e-04 | 8438.08 ms | -100.0% bf16 MFU | 62133 tok/s +step 4996/19560 | loss 3.557566 (-0.53z)| norm 0.3020 (+1.05z)| lr 5.26e-04 | 8439.14 ms | -100.0% bf16 MFU | 62133 tok/s +step 4997/19560 | loss 3.590796 (+0.25z)| norm 0.2721 (-0.28z)| lr 5.26e-04 | 8440.75 ms | -100.0% bf16 MFU | 62132 tok/s +step 4998/19560 | loss 3.560542 (-0.45z)| norm 0.2836 (+0.22z)| lr 5.26e-04 | 8440.92 ms | -100.0% bf16 MFU | 62131 tok/s +step 4999/19560 | loss 3.684922 (+2.45z)| norm 0.2712 (-0.34z)| lr 5.26e-04 | 8436.07 ms | -100.0% bf16 MFU | 62132 tok/s +step 5000/19560 | loss 3.531260 (-1.14z)| norm 0.2734 (-0.25z)| lr 5.26e-04 | 8439.03 ms | -100.0% bf16 MFU | 62131 tok/s +val loss 3.545444 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2715/10042 = 0.270364 +Writing checkpoint at step 5000 +Writing model to log124M/model_00005000.bin +Writing state to log124M/state_00005000_00000.bin +step 5001/19560 | loss 3.584346 (+0.10z)| norm 0.3095 (+1.36z)| lr 5.26e-04 | 8439.85 ms | -100.0% bf16 MFU | 62131 tok/s +step 5002/19560 | loss 3.579551 (-0.02z)| norm 0.2714 (-0.36z)| lr 5.26e-04 | 8440.39 ms | -100.0% bf16 MFU | 62130 tok/s +step 5003/19560 | loss 3.613336 (+0.78z)| norm 0.2715 (-0.35z)| lr 5.26e-04 | 8442.49 ms | -100.0% bf16 MFU | 62129 tok/s +step 5004/19560 | loss 3.547910 (-0.76z)| norm 0.2828 (+0.15z)| lr 5.26e-04 | 8461.66 ms | -100.0% bf16 MFU | 62120 tok/s +step 5005/19560 | loss 3.589655 (+0.23z)| norm 0.2668 (-0.57z)| lr 5.26e-04 | 8463.58 ms | -100.0% bf16 MFU | 62111 tok/s +step 5006/19560 | loss 3.532079 (-1.11z)| norm 0.2763 (-0.15z)| lr 5.26e-04 | 8439.08 ms | -100.0% bf16 MFU | 62112 tok/s +step 5007/19560 | loss 3.563993 (-0.37z)| norm 0.3056 (+1.16z)| lr 5.26e-04 | 8442.83 ms | -100.0% bf16 MFU | 62112 tok/s +step 5008/19560 | loss 3.604966 (+0.59z)| norm 0.2731 (-0.30z)| lr 5.26e-04 | 8439.57 ms | -100.0% bf16 MFU | 62112 tok/s +step 5009/19560 | loss 3.570694 (-0.21z)| norm 0.2863 (+0.29z)| lr 5.26e-04 | 8443.68 ms | -100.0% bf16 MFU | 62111 tok/s +step 5010/19560 | loss 3.505546 (-1.72z)| norm 0.2934 (+0.60z)| lr 5.26e-04 | 8443.12 ms | -100.0% bf16 MFU | 62110 tok/s +step 5011/19560 | loss 3.586334 (+0.17z)| norm 0.2728 (-0.31z)| lr 5.26e-04 | 8445.94 ms | -100.0% bf16 MFU | 62109 tok/s +step 5012/19560 | loss 3.573978 (-0.12z)| norm 0.2624 (-0.77z)| lr 5.26e-04 | 8442.67 ms | -100.0% bf16 MFU | 62108 tok/s +step 5013/19560 | loss 3.569190 (-0.23z)| norm 0.2675 (-0.55z)| lr 5.26e-04 | 8440.46 ms | -100.0% bf16 MFU | 62109 tok/s +step 5014/19560 | loss 3.622459 (+1.00z)| norm 0.2489 (-1.39z)| lr 5.26e-04 | 8443.73 ms | -100.0% bf16 MFU | 62108 tok/s +step 5015/19560 | loss 3.552697 (-0.63z)| norm 0.2513 (-1.28z)| lr 5.26e-04 | 8439.69 ms | -100.0% bf16 MFU | 62108 tok/s +step 5016/19560 | loss 3.557585 (-0.51z)| norm 0.2811 (+0.06z)| lr 5.26e-04 | 8439.71 ms | -100.0% bf16 MFU | 62109 tok/s +step 5017/19560 | loss 3.568102 (-0.26z)| norm 0.3073 (+1.25z)| lr 5.26e-04 | 8439.14 ms | -100.0% bf16 MFU | 62110 tok/s +step 5018/19560 | loss 3.598249 (+0.44z)| norm 0.3092 (+1.31z)| lr 5.26e-04 | 8440.08 ms | -100.0% bf16 MFU | 62110 tok/s +step 5019/19560 | loss 3.589730 (+0.24z)| norm 0.3020 (+0.96z)| lr 5.26e-04 | 8442.45 ms | -100.0% bf16 MFU | 62110 tok/s +step 5020/19560 | loss 3.574116 (-0.12z)| norm 0.2892 (+0.37z)| lr 5.26e-04 | 8440.80 ms | -100.0% bf16 MFU | 62110 tok/s +step 5021/19560 | loss 3.482073 (-2.23z)| norm 0.2774 (-0.17z)| lr 5.26e-04 | 8440.48 ms | -100.0% bf16 MFU | 62110 tok/s +step 5022/19560 | loss 3.548513 (-0.68z)| norm 0.2840 (+0.15z)| lr 5.26e-04 | 8439.66 ms | -100.0% bf16 MFU | 62111 tok/s +step 5023/19560 | loss 3.567398 (-0.24z)| norm 0.2982 (+0.89z)| lr 5.26e-04 | 8443.15 ms | -100.0% bf16 MFU | 62110 tok/s +step 5024/19560 | loss 3.543226 (-0.80z)| norm 0.3222 (+2.06z)| lr 5.26e-04 | 8442.20 ms | -100.0% bf16 MFU | 62110 tok/s +step 5025/19560 | loss 3.617533 (+0.90z)| norm 0.2769 (-0.15z)| lr 5.25e-04 | 8439.43 ms | -100.0% bf16 MFU | 62111 tok/s +step 5026/19560 | loss 3.561292 (-0.39z)| norm 0.2751 (-0.23z)| lr 5.25e-04 | 8436.24 ms | -100.0% bf16 MFU | 62112 tok/s +step 5027/19560 | loss 3.550657 (-0.63z)| norm 0.2789 (-0.02z)| lr 5.25e-04 | 8442.01 ms | -100.0% bf16 MFU | 62112 tok/s +step 5028/19560 | loss 3.554493 (-0.53z)| norm 0.2796 (+0.02z)| lr 5.25e-04 | 8440.23 ms | -100.0% bf16 MFU | 62112 tok/s +step 5029/19560 | loss 3.604503 (+0.64z)| norm 0.2823 (+0.15z)| lr 5.25e-04 | 8441.14 ms | -100.0% bf16 MFU | 62112 tok/s +step 5030/19560 | loss 3.633619 (+1.30z)| norm 0.2917 (+0.65z)| lr 5.25e-04 | 8441.43 ms | -100.0% bf16 MFU | 62112 tok/s +step 5031/19560 | loss 3.564686 (-0.31z)| norm 0.2918 (+0.64z)| lr 5.25e-04 | 8438.36 ms | -100.0% bf16 MFU | 62113 tok/s +step 5032/19560 | loss 3.529223 (-1.12z)| norm 0.2782 (-0.07z)| lr 5.25e-04 | 8440.38 ms | -100.0% bf16 MFU | 62113 tok/s +step 5033/19560 | loss 3.514621 (-1.43z)| norm 0.2540 (-1.34z)| lr 5.25e-04 | 8440.15 ms | -100.0% bf16 MFU | 62113 tok/s +step 5034/19560 | loss 3.564008 (-0.29z)| norm 0.2738 (-0.31z)| lr 5.25e-04 | 8439.41 ms | -100.0% bf16 MFU | 62114 tok/s +step 5035/19560 | loss 3.567051 (-0.23z)| norm 0.2705 (-0.48z)| lr 5.25e-04 | 8439.66 ms | -100.0% bf16 MFU | 62114 tok/s +step 5036/19560 | loss 3.612234 (+0.80z)| norm 0.2523 (-1.42z)| lr 5.25e-04 | 8439.10 ms | -100.0% bf16 MFU | 62115 tok/s +step 5037/19560 | loss 3.606553 (+0.66z)| norm 0.2779 (-0.09z)| lr 5.25e-04 | 8443.07 ms | -100.0% bf16 MFU | 62114 tok/s +step 5038/19560 | loss 3.590450 (+0.34z)| norm 0.2925 (+0.67z)| lr 5.25e-04 | 8441.55 ms | -100.0% bf16 MFU | 62114 tok/s +step 5039/19560 | loss 3.597567 (+0.50z)| norm 0.2772 (-0.12z)| lr 5.25e-04 | 8440.68 ms | -100.0% bf16 MFU | 62114 tok/s +step 5040/19560 | loss 3.522325 (-1.30z)| norm 0.2439 (-1.82z)| lr 5.25e-04 | 8438.38 ms | -100.0% bf16 MFU | 62115 tok/s +step 5041/19560 | loss 3.562279 (-0.33z)| norm 0.2609 (-0.92z)| lr 5.25e-04 | 8443.06 ms | -100.0% bf16 MFU | 62114 tok/s +step 5042/19560 | loss 3.598217 (+0.53z)| norm 0.2772 (-0.07z)| lr 5.25e-04 | 8440.33 ms | -100.0% bf16 MFU | 62114 tok/s +step 5043/19560 | loss 3.575283 (-0.02z)| norm 0.2708 (-0.41z)| lr 5.25e-04 | 8440.05 ms | -100.0% bf16 MFU | 62114 tok/s +step 5044/19560 | loss 3.515097 (-1.45z)| norm 0.2616 (-0.89z)| lr 5.25e-04 | 8441.05 ms | -100.0% bf16 MFU | 62114 tok/s +step 5045/19560 | loss 3.600044 (+0.59z)| norm 0.2704 (-0.44z)| lr 5.25e-04 | 8439.52 ms | -100.0% bf16 MFU | 62115 tok/s +step 5046/19560 | loss 3.576964 (+0.05z)| norm 0.2886 (+0.51z)| lr 5.25e-04 | 8440.77 ms | -100.0% bf16 MFU | 62114 tok/s +step 5047/19560 | loss 3.552475 (-0.54z)| norm 0.2994 (+1.05z)| lr 5.25e-04 | 8441.85 ms | -100.0% bf16 MFU | 62114 tok/s +step 5048/19560 | loss 3.552853 (-0.52z)| norm 0.2718 (-0.37z)| lr 5.25e-04 | 8439.83 ms | -100.0% bf16 MFU | 62114 tok/s +step 5049/19560 | loss 3.541761 (-0.79z)| norm 0.2729 (-0.32z)| lr 5.25e-04 | 8439.69 ms | -100.0% bf16 MFU | 62115 tok/s +step 5050/19560 | loss 3.526125 (-1.16z)| norm 0.2522 (-1.37z)| lr 5.25e-04 | 8440.96 ms | -100.0% bf16 MFU | 62115 tok/s +step 5051/19560 | loss 3.625506 (+1.25z)| norm 0.2736 (-0.29z)| lr 5.25e-04 | 8444.03 ms | -100.0% bf16 MFU | 62113 tok/s +step 5052/19560 | loss 3.554579 (-0.47z)| norm 0.2873 (+0.42z)| lr 5.25e-04 | 8438.85 ms | -100.0% bf16 MFU | 62114 tok/s +step 5053/19560 | loss 3.541698 (-0.78z)| norm 0.2753 (-0.22z)| lr 5.25e-04 | 8442.60 ms | -100.0% bf16 MFU | 62113 tok/s +step 5054/19560 | loss 3.488255 (-2.04z)| norm 0.2436 (-1.85z)| lr 5.25e-04 | 8441.06 ms | -100.0% bf16 MFU | 62113 tok/s +step 5055/19560 | loss 3.595076 (+0.51z)| norm 0.2682 (-0.58z)| lr 5.24e-04 | 8437.54 ms | -100.0% bf16 MFU | 62115 tok/s +step 5056/19560 | loss 3.534211 (-0.94z)| norm 0.2548 (-1.26z)| lr 5.24e-04 | 8435.70 ms | -100.0% bf16 MFU | 62116 tok/s +step 5057/19560 | loss 3.558657 (-0.35z)| norm 0.2589 (-1.05z)| lr 5.24e-04 | 8435.84 ms | -100.0% bf16 MFU | 62118 tok/s +step 5058/19560 | loss 3.601835 (+0.67z)| norm 0.2545 (-1.27z)| lr 5.24e-04 | 8434.11 ms | -100.0% bf16 MFU | 62120 tok/s +step 5059/19560 | loss 3.532185 (-0.98z)| norm 0.2819 (+0.14z)| lr 5.24e-04 | 8431.88 ms | -100.0% bf16 MFU | 62123 tok/s +step 5060/19560 | loss 3.596243 (+0.59z)| norm 0.2544 (-1.26z)| lr 5.24e-04 | 8432.37 ms | -100.0% bf16 MFU | 62126 tok/s +step 5061/19560 | loss 3.561283 (-0.26z)| norm 0.2639 (-0.76z)| lr 5.24e-04 | 8431.63 ms | -100.0% bf16 MFU | 62129 tok/s +step 5062/19560 | loss 3.596716 (+0.60z)| norm 0.3013 (+1.13z)| lr 5.24e-04 | 8429.99 ms | -100.0% bf16 MFU | 62132 tok/s +step 5063/19560 | loss 3.520402 (-1.26z)| norm 0.2717 (-0.37z)| lr 5.24e-04 | 8431.12 ms | -100.0% bf16 MFU | 62134 tok/s +step 5064/19560 | loss 3.606594 (+0.83z)| norm 0.3136 (+1.76z)| lr 5.24e-04 | 8447.05 ms | -100.0% bf16 MFU | 62131 tok/s +step 5065/19560 | loss 3.583745 (+0.31z)| norm 0.2921 (+0.65z)| lr 5.24e-04 | 8454.77 ms | -100.0% bf16 MFU | 62125 tok/s +step 5066/19560 | loss 3.521586 (-1.26z)| norm 0.2917 (+0.62z)| lr 5.24e-04 | 8455.08 ms | -100.0% bf16 MFU | 62119 tok/s +step 5067/19560 | loss 3.592586 (+0.54z)| norm 0.2736 (-0.29z)| lr 5.24e-04 | 8455.43 ms | -100.0% bf16 MFU | 62114 tok/s +step 5068/19560 | loss 3.533093 (-0.95z)| norm 0.2674 (-0.60z)| lr 5.24e-04 | 8459.14 ms | -100.0% bf16 MFU | 62107 tok/s +step 5069/19560 | loss 3.525841 (-1.12z)| norm 0.2741 (-0.27z)| lr 5.24e-04 | 8456.01 ms | -100.0% bf16 MFU | 62102 tok/s +step 5070/19560 | loss 3.515038 (-1.37z)| norm 0.2477 (-1.61z)| lr 5.24e-04 | 8456.39 ms | -100.0% bf16 MFU | 62097 tok/s +step 5071/19560 | loss 3.484665 (-2.09z)| norm 0.2742 (-0.25z)| lr 5.24e-04 | 8457.20 ms | -100.0% bf16 MFU | 62091 tok/s +step 5072/19560 | loss 3.512037 (-1.39z)| norm 0.2814 (+0.11z)| lr 5.24e-04 | 8452.76 ms | -100.0% bf16 MFU | 62088 tok/s +step 5073/19560 | loss 3.560341 (-0.19z)| norm 0.2644 (-0.75z)| lr 5.24e-04 | 8458.32 ms | -100.0% bf16 MFU | 62083 tok/s +step 5074/19560 | loss 3.479956 (-2.13z)| norm 0.2358 (-2.15z)| lr 5.24e-04 | 8455.63 ms | -100.0% bf16 MFU | 62079 tok/s +step 5075/19560 | loss 3.526146 (-0.99z)| norm 0.2663 (-0.64z)| lr 5.24e-04 | 8453.96 ms | -100.0% bf16 MFU | 62076 tok/s +step 5076/19560 | loss 3.463535 (-2.45z)| norm 0.2546 (-1.21z)| lr 5.24e-04 | 8457.07 ms | -100.0% bf16 MFU | 62072 tok/s +step 5077/19560 | loss 3.570706 (+0.10z)| norm 0.2640 (-0.74z)| lr 5.24e-04 | 8459.18 ms | -100.0% bf16 MFU | 62067 tok/s +step 5078/19560 | loss 3.520518 (-1.09z)| norm 0.2559 (-1.14z)| lr 5.24e-04 | 8460.52 ms | -100.0% bf16 MFU | 62062 tok/s +step 5079/19560 | loss 3.511109 (-1.30z)| norm 0.2719 (-0.36z)| lr 5.24e-04 | 8456.14 ms | -100.0% bf16 MFU | 62059 tok/s +step 5080/19560 | loss 3.545585 (-0.48z)| norm 0.2901 (+0.56z)| lr 5.24e-04 | 8461.08 ms | -100.0% bf16 MFU | 62054 tok/s +step 5081/19560 | loss 3.525186 (-0.95z)| norm 0.2740 (-0.25z)| lr 5.24e-04 | 8447.05 ms | -100.0% bf16 MFU | 62055 tok/s +step 5082/19560 | loss 3.601844 (+0.89z)| norm 0.2672 (-0.59z)| lr 5.24e-04 | 8454.29 ms | -100.0% bf16 MFU | 62053 tok/s +step 5083/19560 | loss 3.586976 (+0.52z)| norm 0.2630 (-0.79z)| lr 5.24e-04 | 8453.19 ms | -100.0% bf16 MFU | 62052 tok/s +step 5084/19560 | loss 3.521734 (-1.03z)| norm 0.2858 (+0.41z)| lr 5.24e-04 | 8454.15 ms | -100.0% bf16 MFU | 62050 tok/s +step 5085/19560 | loss 3.686807 (+2.79z)| norm 1.1318 (+10.94z)| lr 5.23e-04 | 8456.09 ms | -100.0% bf16 MFU | 62047 tok/s +step 5086/19560 | loss 3.570035 (+0.08z)| norm 0.3412 (+0.73z)| lr 5.23e-04 | 8449.93 ms | -100.0% bf16 MFU | 62047 tok/s +step 5087/19560 | loss 3.505807 (-1.39z)| norm 0.2995 (+0.19z)| lr 5.23e-04 | 8455.21 ms | -100.0% bf16 MFU | 62045 tok/s +step 5088/19560 | loss 3.535780 (-0.68z)| norm 0.3271 (+0.54z)| lr 5.23e-04 | 8451.62 ms | -100.0% bf16 MFU | 62045 tok/s +step 5089/19560 | loss 3.537829 (-0.62z)| norm 0.2727 (-0.15z)| lr 5.23e-04 | 8448.78 ms | -100.0% bf16 MFU | 62045 tok/s +step 5090/19560 | loss 3.583402 (+0.45z)| norm 0.2583 (-0.34z)| lr 5.23e-04 | 8448.29 ms | -100.0% bf16 MFU | 62046 tok/s +step 5091/19560 | loss 3.556182 (-0.19z)| norm 0.2847 (+0.00z)| lr 5.23e-04 | 8447.67 ms | -100.0% bf16 MFU | 62047 tok/s +step 5092/19560 | loss 3.534352 (-0.69z)| norm 0.2700 (-0.19z)| lr 5.23e-04 | 8446.13 ms | -100.0% bf16 MFU | 62048 tok/s +step 5093/19560 | loss 3.531754 (-0.75z)| norm 0.2889 (+0.05z)| lr 5.23e-04 | 8442.02 ms | -100.0% bf16 MFU | 62051 tok/s +step 5094/19560 | loss 3.498895 (-1.49z)| norm 0.2635 (-0.28z)| lr 5.23e-04 | 8454.69 ms | -100.0% bf16 MFU | 62049 tok/s +step 5095/19560 | loss 3.526340 (-0.84z)| norm 0.2538 (-0.40z)| lr 5.23e-04 | 8444.39 ms | -100.0% bf16 MFU | 62051 tok/s +step 5096/19560 | loss 3.582661 (+0.49z)| norm 0.2521 (-0.42z)| lr 5.23e-04 | 8450.82 ms | -100.0% bf16 MFU | 62050 tok/s +step 5097/19560 | loss 3.517483 (-1.09z)| norm 0.2491 (-0.45z)| lr 5.23e-04 | 8439.94 ms | -100.0% bf16 MFU | 62054 tok/s +step 5098/19560 | loss 3.601870 (+1.06z)| norm 0.3207 (+0.47z)| lr 5.23e-04 | 8446.38 ms | -100.0% bf16 MFU | 62055 tok/s +step 5099/19560 | loss 3.469634 (-2.25z)| norm 0.3048 (+0.26z)| lr 5.23e-04 | 8449.10 ms | -100.0% bf16 MFU | 62055 tok/s +step 5100/19560 | loss 3.597918 (+0.98z)| norm 0.2874 (+0.04z)| lr 5.23e-04 | 8447.82 ms | -100.0% bf16 MFU | 62055 tok/s +step 5101/19560 | loss 3.513806 (-1.14z)| norm 0.2726 (-0.15z)| lr 5.23e-04 | 8445.65 ms | -100.0% bf16 MFU | 62056 tok/s +step 5102/19560 | loss 3.569353 (+0.25z)| norm 0.2751 (-0.12z)| lr 5.23e-04 | 8439.85 ms | -100.0% bf16 MFU | 62059 tok/s +step 5103/19560 | loss 3.505591 (-1.34z)| norm 0.2578 (-0.33z)| lr 5.23e-04 | 8445.06 ms | -100.0% bf16 MFU | 62060 tok/s +step 5104/19560 | loss 3.533642 (-0.63z)| norm 0.2858 (+0.03z)| lr 5.23e-04 | 8443.72 ms | -100.0% bf16 MFU | 62062 tok/s +step 5105/19560 | loss 3.524344 (-0.85z)| norm 0.2780 (-0.07z)| lr 5.23e-04 | 8446.49 ms | -100.0% bf16 MFU | 62063 tok/s +step 5106/19560 | loss 3.484329 (-1.81z)| norm 0.2533 (-0.39z)| lr 5.23e-04 | 8447.75 ms | -100.0% bf16 MFU | 62063 tok/s +step 5107/19560 | loss 3.536230 (-0.53z)| norm 0.2963 (+0.16z)| lr 5.23e-04 | 8443.41 ms | -100.0% bf16 MFU | 62064 tok/s +step 5108/19560 | loss 3.498398 (-1.47z)| norm 0.2688 (-0.19z)| lr 5.23e-04 | 8449.69 ms | -100.0% bf16 MFU | 62063 tok/s +step 5109/19560 | loss 3.530028 (-0.69z)| norm 0.2579 (-0.34z)| lr 5.23e-04 | 8449.13 ms | -100.0% bf16 MFU | 62063 tok/s +step 5110/19560 | loss 3.557427 (-0.02z)| norm 0.2655 (-0.24z)| lr 5.23e-04 | 8449.79 ms | -100.0% bf16 MFU | 62062 tok/s +step 5111/19560 | loss 3.509247 (-1.20z)| norm 0.2597 (-0.31z)| lr 5.23e-04 | 8458.15 ms | -100.0% bf16 MFU | 62058 tok/s +step 5112/19560 | loss 3.567739 (+0.25z)| norm 0.2683 (-0.20z)| lr 5.23e-04 | 8454.25 ms | -100.0% bf16 MFU | 62056 tok/s +step 5113/19560 | loss 3.555548 (-0.04z)| norm 0.2838 (-0.00z)| lr 5.23e-04 | 8452.03 ms | -100.0% bf16 MFU | 62055 tok/s +step 5114/19560 | loss 3.573628 (+0.41z)| norm 0.2549 (-0.37z)| lr 5.23e-04 | 8451.65 ms | -100.0% bf16 MFU | 62054 tok/s +step 5115/19560 | loss 3.523392 (-0.86z)| norm 0.2814 (-0.03z)| lr 5.22e-04 | 8454.31 ms | -100.0% bf16 MFU | 62052 tok/s +step 5116/19560 | loss 3.591797 (+0.87z)| norm 0.2748 (-0.11z)| lr 5.22e-04 | 8450.78 ms | -100.0% bf16 MFU | 62051 tok/s +step 5117/19560 | loss 3.528864 (-0.71z)| norm 0.2613 (-0.28z)| lr 5.22e-04 | 8450.89 ms | -100.0% bf16 MFU | 62051 tok/s +step 5118/19560 | loss 3.561063 (+0.10z)| norm 0.2657 (-0.22z)| lr 5.22e-04 | 8451.04 ms | -100.0% bf16 MFU | 62050 tok/s +step 5119/19560 | loss 3.510536 (-1.16z)| norm 0.2459 (-0.48z)| lr 5.22e-04 | 8445.24 ms | -100.0% bf16 MFU | 62052 tok/s +step 5120/19560 | loss 3.545024 (-0.28z)| norm 0.2956 (+0.16z)| lr 5.22e-04 | 8444.04 ms | -100.0% bf16 MFU | 62053 tok/s +step 5121/19560 | loss 3.587772 (+0.79z)| norm 0.2988 (+0.20z)| lr 5.22e-04 | 8440.11 ms | -100.0% bf16 MFU | 62057 tok/s +step 5122/19560 | loss 3.556459 (+0.01z)| norm 0.2847 (+0.02z)| lr 5.22e-04 | 8441.21 ms | -100.0% bf16 MFU | 62059 tok/s +step 5123/19560 | loss 3.557488 (+0.03z)| norm 0.2651 (-0.23z)| lr 5.22e-04 | 8447.49 ms | -100.0% bf16 MFU | 62060 tok/s +step 5124/19560 | loss 3.503335 (-1.32z)| norm 0.2913 (+0.11z)| lr 5.22e-04 | 8450.85 ms | -100.0% bf16 MFU | 62059 tok/s +step 5125/19560 | loss 3.540722 (-0.37z)| norm 0.2909 (+0.10z)| lr 5.22e-04 | 8444.55 ms | -100.0% bf16 MFU | 62060 tok/s +step 5126/19560 | loss 3.521272 (-0.85z)| norm 0.2848 (+0.03z)| lr 5.22e-04 | 8445.44 ms | -100.0% bf16 MFU | 62061 tok/s +step 5127/19560 | loss 3.597055 (+1.11z)| norm 0.2854 (+0.03z)| lr 5.22e-04 | 8447.89 ms | -100.0% bf16 MFU | 62061 tok/s +step 5128/19560 | loss 3.558578 (+0.10z)| norm 0.2665 (-0.21z)| lr 5.22e-04 | 8449.62 ms | -100.0% bf16 MFU | 62060 tok/s +step 5129/19560 | loss 3.558989 (+0.12z)| norm 0.3174 (+0.44z)| lr 5.22e-04 | 8444.66 ms | -100.0% bf16 MFU | 62062 tok/s +step 5130/19560 | loss 3.525035 (-0.76z)| norm 0.2968 (+0.18z)| lr 5.22e-04 | 8442.98 ms | -100.0% bf16 MFU | 62063 tok/s +step 5131/19560 | loss 3.504863 (-1.27z)| norm 0.2717 (-0.15z)| lr 5.22e-04 | 8444.05 ms | -100.0% bf16 MFU | 62065 tok/s +step 5132/19560 | loss 3.560823 (+0.20z)| norm 0.2904 (+0.09z)| lr 5.22e-04 | 8447.35 ms | -100.0% bf16 MFU | 62065 tok/s +step 5133/19560 | loss 3.505479 (-1.23z)| norm 0.3008 (+0.22z)| lr 5.22e-04 | 8451.93 ms | -100.0% bf16 MFU | 62063 tok/s +step 5134/19560 | loss 3.585581 (+0.85z)| norm 0.3028 (+0.25z)| lr 5.22e-04 | 8442.70 ms | -100.0% bf16 MFU | 62065 tok/s +step 5135/19560 | loss 3.557054 (+0.11z)| norm 0.3092 (+0.33z)| lr 5.22e-04 | 8448.32 ms | -100.0% bf16 MFU | 62065 tok/s +step 5136/19560 | loss 3.552716 (+0.00z)| norm 0.3966 (+1.43z)| lr 5.22e-04 | 8448.61 ms | -100.0% bf16 MFU | 62064 tok/s +step 5137/19560 | loss 3.604216 (+1.34z)| norm 0.2683 (-0.21z)| lr 5.22e-04 | 8448.67 ms | -100.0% bf16 MFU | 62064 tok/s +step 5138/19560 | loss 3.552285 (-0.02z)| norm 0.3020 (+0.22z)| lr 5.22e-04 | 8446.12 ms | -100.0% bf16 MFU | 62064 tok/s +step 5139/19560 | loss 3.531739 (-0.55z)| norm 0.2761 (-0.11z)| lr 5.22e-04 | 8447.32 ms | -100.0% bf16 MFU | 62064 tok/s +step 5140/19560 | loss 3.481163 (-1.85z)| norm 0.3083 (+0.30z)| lr 5.22e-04 | 8448.89 ms | -100.0% bf16 MFU | 62064 tok/s +step 5141/19560 | loss 3.548946 (-0.08z)| norm 0.2755 (-0.12z)| lr 5.22e-04 | 8446.03 ms | -100.0% bf16 MFU | 62064 tok/s +step 5142/19560 | loss 3.550990 (-0.01z)| norm 0.2782 (-0.09z)| lr 5.22e-04 | 8444.32 ms | -100.0% bf16 MFU | 62066 tok/s +step 5143/19560 | loss 3.559991 (+0.23z)| norm 0.3113 (+0.33z)| lr 5.22e-04 | 8445.86 ms | -100.0% bf16 MFU | 62066 tok/s +step 5144/19560 | loss 3.544839 (-0.17z)| norm 0.2557 (-0.38z)| lr 5.22e-04 | 8446.42 ms | -100.0% bf16 MFU | 62066 tok/s +step 5145/19560 | loss 3.509946 (-1.08z)| norm 0.2651 (-0.26z)| lr 5.21e-04 | 8446.41 ms | -100.0% bf16 MFU | 62067 tok/s +step 5146/19560 | loss 3.532011 (-0.48z)| norm 0.2827 (-0.03z)| lr 5.21e-04 | 8447.42 ms | -100.0% bf16 MFU | 62067 tok/s +step 5147/19560 | loss 3.591033 (+1.08z)| norm 0.2676 (-0.22z)| lr 5.21e-04 | 8444.02 ms | -100.0% bf16 MFU | 62068 tok/s +step 5148/19560 | loss 3.481086 (-1.79z)| norm 0.2631 (-0.27z)| lr 5.21e-04 | 8439.17 ms | -100.0% bf16 MFU | 62071 tok/s +step 5149/19560 | loss 3.572226 (+0.58z)| norm 0.2646 (-0.25z)| lr 5.21e-04 | 8443.04 ms | -100.0% bf16 MFU | 62072 tok/s +step 5150/19560 | loss 3.524093 (-0.69z)| norm 0.2506 (-0.43z)| lr 5.21e-04 | 8443.34 ms | -100.0% bf16 MFU | 62073 tok/s +step 5151/19560 | loss 3.475320 (-1.93z)| norm 0.2758 (-0.10z)| lr 5.21e-04 | 8447.44 ms | -100.0% bf16 MFU | 62073 tok/s +step 5152/19560 | loss 3.606121 (+1.45z)| norm 0.2639 (-0.25z)| lr 5.21e-04 | 8446.97 ms | -100.0% bf16 MFU | 62072 tok/s +step 5153/19560 | loss 3.536680 (-0.33z)| norm 0.2881 (+0.06z)| lr 5.21e-04 | 8439.16 ms | -100.0% bf16 MFU | 62075 tok/s +step 5154/19560 | loss 3.494551 (-1.40z)| norm 0.3061 (+0.28z)| lr 5.21e-04 | 8441.36 ms | -100.0% bf16 MFU | 62077 tok/s +step 5155/19560 | loss 3.503129 (-1.17z)| norm 0.2386 (-0.57z)| lr 5.21e-04 | 8440.14 ms | -100.0% bf16 MFU | 62079 tok/s +step 5156/19560 | loss 3.554230 (+0.15z)| norm 0.2749 (-0.11z)| lr 5.21e-04 | 8442.31 ms | -100.0% bf16 MFU | 62080 tok/s +step 5157/19560 | loss 3.546628 (-0.03z)| norm 0.2640 (-0.25z)| lr 5.21e-04 | 8444.77 ms | -100.0% bf16 MFU | 62080 tok/s +step 5158/19560 | loss 3.598321 (+1.34z)| norm 0.2810 (-0.03z)| lr 5.21e-04 | 8443.59 ms | -100.0% bf16 MFU | 62081 tok/s +step 5159/19560 | loss 3.620172 (+1.87z)| norm 0.2852 (+0.03z)| lr 5.21e-04 | 8443.07 ms | -100.0% bf16 MFU | 62082 tok/s +step 5160/19560 | loss 3.484771 (-1.62z)| norm 0.3063 (+0.29z)| lr 5.21e-04 | 8447.94 ms | -100.0% bf16 MFU | 62081 tok/s +step 5161/19560 | loss 3.587480 (+1.01z)| norm 0.3231 (+0.50z)| lr 5.21e-04 | 8445.97 ms | -100.0% bf16 MFU | 62080 tok/s +step 5162/19560 | loss 3.472282 (-1.91z)| norm 0.3008 (+0.21z)| lr 5.21e-04 | 8442.80 ms | -100.0% bf16 MFU | 62081 tok/s +step 5163/19560 | loss 3.552389 (+0.13z)| norm 0.2536 (-0.39z)| lr 5.21e-04 | 8441.96 ms | -100.0% bf16 MFU | 62083 tok/s +step 5164/19560 | loss 3.475674 (-1.79z)| norm 0.2911 (+0.09z)| lr 5.21e-04 | 8443.91 ms | -100.0% bf16 MFU | 62083 tok/s +step 5165/19560 | loss 3.541543 (-0.11z)| norm 0.2855 (+0.01z)| lr 5.21e-04 | 8441.72 ms | -100.0% bf16 MFU | 62084 tok/s +step 5166/19560 | loss 3.616162 (+1.78z)| norm 0.2590 (-0.32z)| lr 5.21e-04 | 8446.83 ms | -100.0% bf16 MFU | 62083 tok/s +step 5167/19560 | loss 3.521732 (-0.61z)| norm 0.2838 (-0.00z)| lr 5.21e-04 | 8441.34 ms | -100.0% bf16 MFU | 62085 tok/s +step 5168/19560 | loss 3.506222 (-1.00z)| norm 0.2929 (+0.11z)| lr 5.21e-04 | 8444.46 ms | -100.0% bf16 MFU | 62085 tok/s +step 5169/19560 | loss 3.555507 (+0.26z)| norm 0.2694 (-0.19z)| lr 5.21e-04 | 8445.80 ms | -100.0% bf16 MFU | 62084 tok/s +step 5170/19560 | loss 3.537571 (-0.19z)| norm 0.2517 (-0.42z)| lr 5.21e-04 | 8443.96 ms | -100.0% bf16 MFU | 62085 tok/s +step 5171/19560 | loss 3.601426 (+1.44z)| norm 0.2787 (-0.07z)| lr 5.21e-04 | 8437.78 ms | -100.0% bf16 MFU | 62087 tok/s +step 5172/19560 | loss 3.536542 (-0.22z)| norm 0.2664 (-0.23z)| lr 5.21e-04 | 8441.28 ms | -100.0% bf16 MFU | 62088 tok/s +step 5173/19560 | loss 3.558162 (+0.34z)| norm 0.2785 (-0.08z)| lr 5.21e-04 | 8444.34 ms | -100.0% bf16 MFU | 62088 tok/s +step 5174/19560 | loss 3.506218 (-0.98z)| norm 0.2395 (-0.57z)| lr 5.21e-04 | 8444.13 ms | -100.0% bf16 MFU | 62088 tok/s +step 5175/19560 | loss 3.487216 (-1.44z)| norm 0.2846 (+0.01z)| lr 5.20e-04 | 8442.93 ms | -100.0% bf16 MFU | 62089 tok/s +step 5176/19560 | loss 3.504674 (-0.98z)| norm 0.3203 (+0.46z)| lr 5.20e-04 | 8438.65 ms | -100.0% bf16 MFU | 62091 tok/s +step 5177/19560 | loss 3.526552 (-0.43z)| norm 0.3096 (+0.32z)| lr 5.20e-04 | 8443.23 ms | -100.0% bf16 MFU | 62091 tok/s +step 5178/19560 | loss 3.521757 (-0.55z)| norm 0.2523 (-0.41z)| lr 5.20e-04 | 8441.31 ms | -100.0% bf16 MFU | 62092 tok/s +step 5179/19560 | loss 3.632934 (+2.27z)| norm 0.2796 (-0.07z)| lr 5.20e-04 | 8440.71 ms | -100.0% bf16 MFU | 62093 tok/s +step 5180/19560 | loss 3.522925 (-0.51z)| norm 0.2776 (-0.09z)| lr 5.20e-04 | 8446.41 ms | -100.0% bf16 MFU | 62092 tok/s +step 5181/19560 | loss 3.507444 (-0.89z)| norm 0.2680 (-0.21z)| lr 5.20e-04 | 8442.81 ms | -100.0% bf16 MFU | 62092 tok/s +step 5182/19560 | loss 3.593426 (+1.26z)| norm 0.2951 (+0.13z)| lr 5.20e-04 | 8438.95 ms | -100.0% bf16 MFU | 62094 tok/s +step 5183/19560 | loss 3.529280 (-0.35z)| norm 0.2770 (-0.10z)| lr 5.20e-04 | 8441.90 ms | -100.0% bf16 MFU | 62095 tok/s +step 5184/19560 | loss 3.568107 (+0.63z)| norm 0.2685 (-0.21z)| lr 5.20e-04 | 8442.20 ms | -100.0% bf16 MFU | 62095 tok/s +step 5185/19560 | loss 3.555284 (+0.30z)| norm 0.2809 (-0.06z)| lr 5.20e-04 | 8438.76 ms | -100.0% bf16 MFU | 62097 tok/s +step 5186/19560 | loss 3.552028 (+0.23z)| norm 0.2831 (-0.03z)| lr 5.20e-04 | 8442.01 ms | -100.0% bf16 MFU | 62097 tok/s +step 5187/19560 | loss 3.703251 (+3.82z)| norm 0.3214 (+0.45z)| lr 5.20e-04 | 8445.66 ms | -100.0% bf16 MFU | 62096 tok/s +step 5188/19560 | loss 3.528744 \ No newline at end of file