{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01068979463122881, "eval_steps": 500, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 173.0, "learning_rate": 4.4539461963299484e-07, "loss": 2.153, "step": 100 }, { "epoch": 0.0, "grad_norm": 40.5, "learning_rate": 8.907892392659897e-07, "loss": 2.398, "step": 200 }, { "epoch": 0.0, "grad_norm": 223.0, "learning_rate": 1.3361838588989846e-06, "loss": 1.8912, "step": 300 }, { "epoch": 0.0, "grad_norm": 95.5, "learning_rate": 1.7815784785319793e-06, "loss": 1.6403, "step": 400 }, { "epoch": 0.0, "grad_norm": 600.0, "learning_rate": 2.226973098164974e-06, "loss": 1.4652, "step": 500 }, { "epoch": 0.0, "grad_norm": 51.75, "learning_rate": 2.672367717797969e-06, "loss": 1.2489, "step": 600 }, { "epoch": 0.0, "grad_norm": 290.0, "learning_rate": 3.1177623374309637e-06, "loss": 1.2421, "step": 700 }, { "epoch": 0.0, "grad_norm": 27.75, "learning_rate": 3.5631569570639587e-06, "loss": 1.2188, "step": 800 }, { "epoch": 0.0, "grad_norm": 13.0, "learning_rate": 4.008551576696954e-06, "loss": 1.123, "step": 900 }, { "epoch": 0.0, "grad_norm": 39.75, "learning_rate": 4.453946196329948e-06, "loss": 1.0388, "step": 1000 }, { "epoch": 0.0, "grad_norm": 21.375, "learning_rate": 4.899340815962943e-06, "loss": 1.3034, "step": 1100 }, { "epoch": 0.0, "grad_norm": 36.75, "learning_rate": 5.344735435595938e-06, "loss": 1.2272, "step": 1200 }, { "epoch": 0.0, "grad_norm": 71.5, "learning_rate": 5.790130055228933e-06, "loss": 1.1825, "step": 1300 }, { "epoch": 0.0, "grad_norm": 54.5, "learning_rate": 6.235524674861927e-06, "loss": 1.3466, "step": 1400 }, { "epoch": 0.0, "grad_norm": 169.0, "learning_rate": 6.680919294494922e-06, "loss": 1.0543, "step": 1500 }, { "epoch": 0.0, "grad_norm": 47.75, "learning_rate": 7.126313914127917e-06, "loss": 1.1117, "step": 1600 }, { "epoch": 0.0, "grad_norm": 9.5, "learning_rate": 7.571708533760913e-06, "loss": 1.1458, "step": 1700 }, { "epoch": 0.0, "grad_norm": 119.0, "learning_rate": 8.017103153393907e-06, "loss": 1.1823, "step": 1800 }, { "epoch": 0.0, "grad_norm": 31.0, "learning_rate": 8.462497773026902e-06, "loss": 1.2171, "step": 1900 }, { "epoch": 0.0, "grad_norm": 57.25, "learning_rate": 8.907892392659896e-06, "loss": 1.16, "step": 2000 }, { "epoch": 0.0, "grad_norm": 77.0, "learning_rate": 9.353287012292893e-06, "loss": 1.1243, "step": 2100 }, { "epoch": 0.0, "grad_norm": 31.5, "learning_rate": 9.798681631925886e-06, "loss": 1.1218, "step": 2200 }, { "epoch": 0.0, "grad_norm": 98.0, "learning_rate": 1.0244076251558882e-05, "loss": 1.1527, "step": 2300 }, { "epoch": 0.0, "grad_norm": 32.25, "learning_rate": 1.0689470871191876e-05, "loss": 1.1042, "step": 2400 }, { "epoch": 0.0, "grad_norm": 29.0, "learning_rate": 1.1134865490824871e-05, "loss": 1.0131, "step": 2500 }, { "epoch": 0.0, "grad_norm": 50.0, "learning_rate": 1.1580260110457866e-05, "loss": 1.2544, "step": 2600 }, { "epoch": 0.0, "grad_norm": 20.625, "learning_rate": 1.2025654730090862e-05, "loss": 1.1475, "step": 2700 }, { "epoch": 0.0, "grad_norm": 23.375, "learning_rate": 1.2471049349723855e-05, "loss": 1.2345, "step": 2800 }, { "epoch": 0.0, "grad_norm": 39.5, "learning_rate": 1.2916443969356851e-05, "loss": 1.269, "step": 2900 }, { "epoch": 0.0, "grad_norm": 30.5, "learning_rate": 1.3361838588989844e-05, "loss": 1.1995, "step": 3000 }, { "epoch": 0.0, "grad_norm": 17.875, "learning_rate": 1.3807233208622842e-05, "loss": 1.1367, "step": 3100 }, { "epoch": 0.0, "grad_norm": 255.0, "learning_rate": 1.4252627828255835e-05, "loss": 1.1993, "step": 3200 }, { "epoch": 0.0, "grad_norm": 73.5, "learning_rate": 1.469802244788883e-05, "loss": 1.039, "step": 3300 }, { "epoch": 0.0, "grad_norm": 106.5, "learning_rate": 1.5143417067521826e-05, "loss": 1.2538, "step": 3400 }, { "epoch": 0.0, "grad_norm": 153.0, "learning_rate": 1.558881168715482e-05, "loss": 1.156, "step": 3500 }, { "epoch": 0.0, "grad_norm": 111.0, "learning_rate": 1.6034206306787815e-05, "loss": 1.2418, "step": 3600 }, { "epoch": 0.0, "grad_norm": 20.25, "learning_rate": 1.647960092642081e-05, "loss": 1.0018, "step": 3700 }, { "epoch": 0.0, "grad_norm": 68.0, "learning_rate": 1.6924995546053804e-05, "loss": 1.101, "step": 3800 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 1.7370390165686802e-05, "loss": 1.1538, "step": 3900 }, { "epoch": 0.0, "grad_norm": 26.5, "learning_rate": 1.7815784785319793e-05, "loss": 1.176, "step": 4000 }, { "epoch": 0.0, "grad_norm": 96.5, "learning_rate": 1.8261179404952788e-05, "loss": 0.9909, "step": 4100 }, { "epoch": 0.0, "grad_norm": 45.75, "learning_rate": 1.8706574024585786e-05, "loss": 1.1521, "step": 4200 }, { "epoch": 0.0, "grad_norm": 42.0, "learning_rate": 1.915196864421878e-05, "loss": 1.1438, "step": 4300 }, { "epoch": 0.0, "grad_norm": 16.125, "learning_rate": 1.959736326385177e-05, "loss": 1.1887, "step": 4400 }, { "epoch": 0.0, "grad_norm": 38.75, "learning_rate": 2.004275788348477e-05, "loss": 1.2946, "step": 4500 }, { "epoch": 0.0, "grad_norm": 99.5, "learning_rate": 2.0488152503117764e-05, "loss": 1.2574, "step": 4600 }, { "epoch": 0.0, "grad_norm": 65.5, "learning_rate": 2.093354712275076e-05, "loss": 1.1517, "step": 4700 }, { "epoch": 0.0, "grad_norm": 79.5, "learning_rate": 2.1378941742383753e-05, "loss": 1.187, "step": 4800 }, { "epoch": 0.0, "grad_norm": 162.0, "learning_rate": 2.1824336362016748e-05, "loss": 0.9974, "step": 4900 }, { "epoch": 0.0, "grad_norm": 48.5, "learning_rate": 2.2269730981649742e-05, "loss": 1.2278, "step": 5000 }, { "epoch": 0.0, "grad_norm": 0.09423828125, "learning_rate": 2.271512560128274e-05, "loss": 1.2128, "step": 5100 }, { "epoch": 0.0, "grad_norm": 34.75, "learning_rate": 2.316052022091573e-05, "loss": 1.1712, "step": 5200 }, { "epoch": 0.0, "grad_norm": 0.01483154296875, "learning_rate": 2.3605914840548726e-05, "loss": 0.9784, "step": 5300 }, { "epoch": 0.0, "grad_norm": 89.0, "learning_rate": 2.4051309460181724e-05, "loss": 1.1382, "step": 5400 }, { "epoch": 0.0, "grad_norm": 59.5, "learning_rate": 2.449670407981472e-05, "loss": 1.291, "step": 5500 }, { "epoch": 0.0, "grad_norm": 29.5, "learning_rate": 2.494209869944771e-05, "loss": 1.1317, "step": 5600 }, { "epoch": 0.01, "grad_norm": 24.125, "learning_rate": 2.5387493319080707e-05, "loss": 1.1689, "step": 5700 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 2.5832887938713702e-05, "loss": 1.2427, "step": 5800 }, { "epoch": 0.01, "grad_norm": 34.5, "learning_rate": 2.6278282558346697e-05, "loss": 1.353, "step": 5900 }, { "epoch": 0.01, "grad_norm": 32.25, "learning_rate": 2.6723677177979688e-05, "loss": 1.1593, "step": 6000 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 2.716907179761269e-05, "loss": 1.1391, "step": 6100 }, { "epoch": 0.01, "grad_norm": 82.0, "learning_rate": 2.7614466417245684e-05, "loss": 1.1999, "step": 6200 }, { "epoch": 0.01, "grad_norm": 25.375, "learning_rate": 2.8059861036878675e-05, "loss": 1.0865, "step": 6300 }, { "epoch": 0.01, "grad_norm": 34.25, "learning_rate": 2.850525565651167e-05, "loss": 1.0761, "step": 6400 }, { "epoch": 0.01, "grad_norm": 44.0, "learning_rate": 2.8950650276144664e-05, "loss": 1.1187, "step": 6500 }, { "epoch": 0.01, "grad_norm": 38.25, "learning_rate": 2.939604489577766e-05, "loss": 1.0824, "step": 6600 }, { "epoch": 0.01, "grad_norm": 49.75, "learning_rate": 2.9841439515410657e-05, "loss": 1.0826, "step": 6700 }, { "epoch": 0.01, "grad_norm": 47.0, "learning_rate": 3.028683413504365e-05, "loss": 1.1387, "step": 6800 }, { "epoch": 0.01, "grad_norm": 61.0, "learning_rate": 3.073222875467664e-05, "loss": 1.1661, "step": 6900 }, { "epoch": 0.01, "grad_norm": 31.625, "learning_rate": 3.117762337430964e-05, "loss": 1.2022, "step": 7000 }, { "epoch": 0.01, "grad_norm": 38.75, "learning_rate": 3.162301799394263e-05, "loss": 1.2461, "step": 7100 }, { "epoch": 0.01, "grad_norm": 28.5, "learning_rate": 3.206841261357563e-05, "loss": 1.2583, "step": 7200 }, { "epoch": 0.01, "grad_norm": 12.0, "learning_rate": 3.251380723320863e-05, "loss": 1.1094, "step": 7300 }, { "epoch": 0.01, "grad_norm": 68.0, "learning_rate": 3.295920185284162e-05, "loss": 1.2244, "step": 7400 }, { "epoch": 0.01, "grad_norm": 42.0, "learning_rate": 3.3404596472474617e-05, "loss": 1.1486, "step": 7500 }, { "epoch": 0.01, "grad_norm": 344.0, "learning_rate": 3.384999109210761e-05, "loss": 1.2049, "step": 7600 }, { "epoch": 0.01, "grad_norm": 11.6875, "learning_rate": 3.42953857117406e-05, "loss": 1.0226, "step": 7700 }, { "epoch": 0.01, "grad_norm": 43.75, "learning_rate": 3.4740780331373604e-05, "loss": 1.2662, "step": 7800 }, { "epoch": 0.01, "grad_norm": 51.5, "learning_rate": 3.5186174951006595e-05, "loss": 1.1029, "step": 7900 }, { "epoch": 0.01, "grad_norm": 36.25, "learning_rate": 3.5631569570639586e-05, "loss": 1.1623, "step": 8000 }, { "epoch": 0.01, "grad_norm": 32.75, "learning_rate": 3.6076964190272584e-05, "loss": 1.0682, "step": 8100 }, { "epoch": 0.01, "grad_norm": 57.75, "learning_rate": 3.6522358809905575e-05, "loss": 1.1232, "step": 8200 }, { "epoch": 0.01, "grad_norm": 27.125, "learning_rate": 3.696775342953857e-05, "loss": 1.1474, "step": 8300 }, { "epoch": 0.01, "grad_norm": 93.0, "learning_rate": 3.741314804917157e-05, "loss": 1.0526, "step": 8400 }, { "epoch": 0.01, "grad_norm": 20.125, "learning_rate": 3.785854266880456e-05, "loss": 1.2, "step": 8500 }, { "epoch": 0.01, "grad_norm": 46.75, "learning_rate": 3.830393728843756e-05, "loss": 1.1716, "step": 8600 }, { "epoch": 0.01, "grad_norm": 54.75, "learning_rate": 3.874933190807055e-05, "loss": 1.042, "step": 8700 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 3.919472652770354e-05, "loss": 1.1756, "step": 8800 }, { "epoch": 0.01, "grad_norm": 48.25, "learning_rate": 3.964012114733654e-05, "loss": 1.2597, "step": 8900 }, { "epoch": 0.01, "grad_norm": 14.75, "learning_rate": 4.008551576696954e-05, "loss": 1.0449, "step": 9000 }, { "epoch": 0.01, "grad_norm": 18.25, "learning_rate": 4.0530910386602536e-05, "loss": 1.2163, "step": 9100 }, { "epoch": 0.01, "grad_norm": 80.0, "learning_rate": 4.097630500623553e-05, "loss": 1.2302, "step": 9200 }, { "epoch": 0.01, "grad_norm": 32.0, "learning_rate": 4.142169962586852e-05, "loss": 1.1843, "step": 9300 }, { "epoch": 0.01, "grad_norm": 44.25, "learning_rate": 4.186709424550152e-05, "loss": 1.0384, "step": 9400 }, { "epoch": 0.01, "grad_norm": 66.5, "learning_rate": 4.231248886513451e-05, "loss": 1.0315, "step": 9500 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.2757883484767506e-05, "loss": 1.1746, "step": 9600 }, { "epoch": 0.01, "grad_norm": 37.5, "learning_rate": 4.3203278104400504e-05, "loss": 1.2093, "step": 9700 }, { "epoch": 0.01, "grad_norm": 81.5, "learning_rate": 4.3648672724033495e-05, "loss": 1.3342, "step": 9800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.409406734366649e-05, "loss": 1.1162, "step": 9900 }, { "epoch": 0.01, "grad_norm": 356.0, "learning_rate": 4.4539461963299484e-05, "loss": 0.8431, "step": 10000 }, { "epoch": 0.01, "grad_norm": 75.5, "learning_rate": 4.498485658293248e-05, "loss": 1.15, "step": 10100 }, { "epoch": 0.01, "grad_norm": 51.0, "learning_rate": 4.543025120256548e-05, "loss": 1.1635, "step": 10200 }, { "epoch": 0.01, "grad_norm": 136.0, "learning_rate": 4.587564582219847e-05, "loss": 1.1175, "step": 10300 }, { "epoch": 0.01, "grad_norm": 180.0, "learning_rate": 4.632104044183146e-05, "loss": 1.0931, "step": 10400 }, { "epoch": 0.01, "grad_norm": 107.5, "learning_rate": 4.676643506146446e-05, "loss": 1.3118, "step": 10500 }, { "epoch": 0.01, "grad_norm": 34.0, "learning_rate": 4.721182968109745e-05, "loss": 1.0301, "step": 10600 }, { "epoch": 0.01, "grad_norm": 33.75, "learning_rate": 4.765722430073045e-05, "loss": 0.9912, "step": 10700 }, { "epoch": 0.01, "grad_norm": 48.5, "learning_rate": 4.810261892036345e-05, "loss": 1.1649, "step": 10800 }, { "epoch": 0.01, "grad_norm": 88.5, "learning_rate": 4.854801353999644e-05, "loss": 1.4056, "step": 10900 }, { "epoch": 0.01, "grad_norm": 143.0, "learning_rate": 4.899340815962944e-05, "loss": 1.1315, "step": 11000 }, { "epoch": 0.01, "grad_norm": 26.5, "learning_rate": 4.943880277926243e-05, "loss": 1.231, "step": 11100 }, { "epoch": 0.01, "grad_norm": 47.75, "learning_rate": 4.988419739889542e-05, "loss": 1.2519, "step": 11200 }, { "epoch": 0.01, "grad_norm": 4.5625, "learning_rate": 4.999667068583872e-05, "loss": 1.1036, "step": 11300 }, { "epoch": 0.01, "grad_norm": 72.5, "learning_rate": 4.99921716126478e-05, "loss": 1.0886, "step": 11400 }, { "epoch": 0.01, "grad_norm": 102.5, "learning_rate": 4.998767253945688e-05, "loss": 1.1742, "step": 11500 }, { "epoch": 0.01, "grad_norm": 127.0, "learning_rate": 4.9983173466265954e-05, "loss": 1.2533, "step": 11600 }, { "epoch": 0.01, "grad_norm": 221.0, "learning_rate": 4.997867439307503e-05, "loss": 1.1356, "step": 11700 }, { "epoch": 0.01, "grad_norm": 132.0, "learning_rate": 4.997417531988411e-05, "loss": 1.0534, "step": 11800 }, { "epoch": 0.01, "grad_norm": 30.625, "learning_rate": 4.996967624669318e-05, "loss": 0.936, "step": 11900 }, { "epoch": 0.01, "grad_norm": 56.5, "learning_rate": 4.9965177173502256e-05, "loss": 1.0134, "step": 12000 } ], "logging_steps": 100, "max_steps": 1122566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 6000, "total_flos": 1.8994803365154816e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }