{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1000, "global_step": 1972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.0303030303030305e-07, "loss": 1.0924, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.0303030303030305e-06, "loss": 0.8661, "step": 10 }, { "epoch": 0.02, "learning_rate": 6.060606060606061e-06, "loss": 0.8932, "step": 20 }, { "epoch": 0.03, "learning_rate": 9.090909090909091e-06, "loss": 0.8119, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.2121212121212122e-05, "loss": 0.7431, "step": 40 }, { "epoch": 0.05, "learning_rate": 1.5151515151515153e-05, "loss": 0.6742, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.8181818181818182e-05, "loss": 0.6329, "step": 60 }, { "epoch": 0.07, "learning_rate": 2.121212121212121e-05, "loss": 0.5587, "step": 70 }, { "epoch": 0.08, "learning_rate": 2.4242424242424244e-05, "loss": 0.5006, "step": 80 }, { "epoch": 0.09, "learning_rate": 2.7272727272727273e-05, "loss": 0.5024, "step": 90 }, { "epoch": 0.1, "learning_rate": 2.9999978899859252e-05, "loss": 0.4811, "step": 100 }, { "epoch": 0.11, "learning_rate": 2.999744695479716e-05, "loss": 0.4825, "step": 110 }, { "epoch": 0.12, "learning_rate": 2.9990695797777382e-05, "loss": 0.4656, "step": 120 }, { "epoch": 0.13, "learning_rate": 2.9979727328094035e-05, "loss": 0.4804, "step": 130 }, { "epoch": 0.14, "learning_rate": 2.9964544631492205e-05, "loss": 0.4627, "step": 140 }, { "epoch": 0.15, "learning_rate": 2.9945151979299888e-05, "loss": 0.4164, "step": 150 }, { "epoch": 0.16, "learning_rate": 2.9921554827226334e-05, "loss": 0.426, "step": 160 }, { "epoch": 0.17, "learning_rate": 2.9893759813827195e-05, "loss": 0.4441, "step": 170 }, { "epoch": 0.18, "learning_rate": 2.98617747586369e-05, "loss": 0.41, "step": 180 }, { "epoch": 0.19, "learning_rate": 2.9825608659968814e-05, "loss": 0.3916, "step": 190 }, { "epoch": 0.2, "learning_rate": 2.9785271692383737e-05, "loss": 0.4281, "step": 200 }, { "epoch": 0.21, "learning_rate": 2.9740775203827525e-05, "loss": 0.4219, "step": 210 }, { "epoch": 0.22, "learning_rate": 2.9692131712438576e-05, "loss": 0.4138, "step": 220 }, { "epoch": 0.23, "learning_rate": 2.963935490302613e-05, "loss": 0.392, "step": 230 }, { "epoch": 0.24, "learning_rate": 2.9582459623220305e-05, "loss": 0.4244, "step": 240 }, { "epoch": 0.25, "learning_rate": 2.9521461879295086e-05, "loss": 0.3797, "step": 250 }, { "epoch": 0.26, "learning_rate": 2.9456378831665264e-05, "loss": 0.3956, "step": 260 }, { "epoch": 0.27, "learning_rate": 2.938722879005873e-05, "loss": 0.4031, "step": 270 }, { "epoch": 0.28, "learning_rate": 2.9314031208365425e-05, "loss": 0.4382, "step": 280 }, { "epoch": 0.29, "learning_rate": 2.923680667916439e-05, "loss": 0.4096, "step": 290 }, { "epoch": 0.3, "learning_rate": 2.9155576927930516e-05, "loss": 0.3965, "step": 300 }, { "epoch": 0.31, "learning_rate": 2.9070364806922495e-05, "loss": 0.3842, "step": 310 }, { "epoch": 0.32, "learning_rate": 2.8981194288753864e-05, "loss": 0.4057, "step": 320 }, { "epoch": 0.33, "learning_rate": 2.888809045964882e-05, "loss": 0.407, "step": 330 }, { "epoch": 0.34, "learning_rate": 2.8791079512384716e-05, "loss": 0.4048, "step": 340 }, { "epoch": 0.35, "learning_rate": 2.869018873892331e-05, "loss": 0.4119, "step": 350 }, { "epoch": 0.37, "learning_rate": 2.8585446522732738e-05, "loss": 0.379, "step": 360 }, { "epoch": 0.38, "learning_rate": 2.8476882330802442e-05, "loss": 0.4007, "step": 370 }, { "epoch": 0.39, "learning_rate": 2.8364526705353253e-05, "loss": 0.3841, "step": 380 }, { "epoch": 0.4, "learning_rate": 2.8248411255244985e-05, "loss": 0.3977, "step": 390 }, { "epoch": 0.41, "learning_rate": 2.8128568647083962e-05, "loss": 0.3957, "step": 400 }, { "epoch": 0.42, "learning_rate": 2.8005032596032966e-05, "loss": 0.4228, "step": 410 }, { "epoch": 0.43, "learning_rate": 2.7877837856326193e-05, "loss": 0.3848, "step": 420 }, { "epoch": 0.44, "learning_rate": 2.7747020211491894e-05, "loss": 0.3618, "step": 430 }, { "epoch": 0.45, "learning_rate": 2.761261646428543e-05, "loss": 0.4117, "step": 440 }, { "epoch": 0.46, "learning_rate": 2.747466442633561e-05, "loss": 0.396, "step": 450 }, { "epoch": 0.47, "learning_rate": 2.7333202907507196e-05, "loss": 0.4077, "step": 460 }, { "epoch": 0.48, "learning_rate": 2.7188271704982577e-05, "loss": 0.4131, "step": 470 }, { "epoch": 0.49, "learning_rate": 2.7039911592065674e-05, "loss": 0.3943, "step": 480 }, { "epoch": 0.5, "learning_rate": 2.688816430671124e-05, "loss": 0.4165, "step": 490 }, { "epoch": 0.51, "learning_rate": 2.6733072539782788e-05, "loss": 0.3639, "step": 500 }, { "epoch": 0.52, "learning_rate": 2.6574679923042412e-05, "loss": 0.4091, "step": 510 }, { "epoch": 0.53, "learning_rate": 2.641303101687593e-05, "loss": 0.3809, "step": 520 }, { "epoch": 0.54, "learning_rate": 2.6248171297756755e-05, "loss": 0.3801, "step": 530 }, { "epoch": 0.55, "learning_rate": 2.6080147145452088e-05, "loss": 0.3986, "step": 540 }, { "epoch": 0.56, "learning_rate": 2.590900582997493e-05, "loss": 0.4076, "step": 550 }, { "epoch": 0.57, "learning_rate": 2.5734795498285684e-05, "loss": 0.3971, "step": 560 }, { "epoch": 0.58, "learning_rate": 2.555756516074704e-05, "loss": 0.3847, "step": 570 }, { "epoch": 0.59, "learning_rate": 2.5377364677335944e-05, "loss": 0.3839, "step": 580 }, { "epoch": 0.6, "learning_rate": 2.5194244743616557e-05, "loss": 0.3845, "step": 590 }, { "epoch": 0.61, "learning_rate": 2.5008256876478146e-05, "loss": 0.3907, "step": 600 }, { "epoch": 0.62, "learning_rate": 2.481945339964189e-05, "loss": 0.4105, "step": 610 }, { "epoch": 0.63, "learning_rate": 2.4627887428940757e-05, "loss": 0.3809, "step": 620 }, { "epoch": 0.64, "learning_rate": 2.4433612857376438e-05, "loss": 0.3887, "step": 630 }, { "epoch": 0.65, "learning_rate": 2.4236684339957745e-05, "loss": 0.4146, "step": 640 }, { "epoch": 0.66, "learning_rate": 2.4037157278324564e-05, "loss": 0.3752, "step": 650 }, { "epoch": 0.67, "learning_rate": 2.383508780516181e-05, "loss": 0.3826, "step": 660 }, { "epoch": 0.68, "learning_rate": 2.3630532768407667e-05, "loss": 0.3884, "step": 670 }, { "epoch": 0.69, "learning_rate": 2.3423549715260643e-05, "loss": 0.3926, "step": 680 }, { "epoch": 0.7, "learning_rate": 2.3214196875989905e-05, "loss": 0.3777, "step": 690 }, { "epoch": 0.71, "learning_rate": 2.300253314755341e-05, "loss": 0.3966, "step": 700 }, { "epoch": 0.72, "learning_rate": 2.2788618077028512e-05, "loss": 0.3884, "step": 710 }, { "epoch": 0.73, "learning_rate": 2.2572511844859682e-05, "loss": 0.3736, "step": 720 }, { "epoch": 0.74, "learning_rate": 2.2354275247927982e-05, "loss": 0.3989, "step": 730 }, { "epoch": 0.75, "learning_rate": 2.2133969682447195e-05, "loss": 0.394, "step": 740 }, { "epoch": 0.76, "learning_rate": 2.1911657126691292e-05, "loss": 0.3795, "step": 750 }, { "epoch": 0.77, "learning_rate": 2.1687400123558168e-05, "loss": 0.3941, "step": 760 }, { "epoch": 0.78, "learning_rate": 2.146126176297453e-05, "loss": 0.3936, "step": 770 }, { "epoch": 0.79, "learning_rate": 2.12333056641469e-05, "loss": 0.405, "step": 780 }, { "epoch": 0.8, "learning_rate": 2.1003595957663693e-05, "loss": 0.3903, "step": 790 }, { "epoch": 0.81, "learning_rate": 2.077219726745346e-05, "loss": 0.3823, "step": 800 }, { "epoch": 0.82, "learning_rate": 2.053917469260431e-05, "loss": 0.3883, "step": 810 }, { "epoch": 0.83, "learning_rate": 2.0304593789049672e-05, "loss": 0.3923, "step": 820 }, { "epoch": 0.84, "learning_rate": 2.0068520551125535e-05, "loss": 0.3741, "step": 830 }, { "epoch": 0.85, "learning_rate": 1.9831021393004352e-05, "loss": 0.3735, "step": 840 }, { "epoch": 0.86, "learning_rate": 1.959216313001081e-05, "loss": 0.3828, "step": 850 }, { "epoch": 0.87, "learning_rate": 1.9352012959824793e-05, "loss": 0.3936, "step": 860 }, { "epoch": 0.88, "learning_rate": 1.9110638443576718e-05, "loss": 0.3533, "step": 870 }, { "epoch": 0.89, "learning_rate": 1.886810748684066e-05, "loss": 0.3512, "step": 880 }, { "epoch": 0.9, "learning_rate": 1.862448832053059e-05, "loss": 0.3703, "step": 890 }, { "epoch": 0.91, "learning_rate": 1.837984948170502e-05, "loss": 0.3976, "step": 900 }, { "epoch": 0.92, "learning_rate": 1.8134259794285595e-05, "loss": 0.3846, "step": 910 }, { "epoch": 0.93, "learning_rate": 1.7887788349694946e-05, "loss": 0.3679, "step": 920 }, { "epoch": 0.94, "learning_rate": 1.7640504487419268e-05, "loss": 0.3693, "step": 930 }, { "epoch": 0.95, "learning_rate": 1.7392477775501178e-05, "loss": 0.3825, "step": 940 }, { "epoch": 0.96, "learning_rate": 1.714377799096817e-05, "loss": 0.3948, "step": 950 }, { "epoch": 0.97, "learning_rate": 1.6894475100202392e-05, "loss": 0.4137, "step": 960 }, { "epoch": 0.98, "learning_rate": 1.6644639239257066e-05, "loss": 0.4053, "step": 970 }, { "epoch": 0.99, "learning_rate": 1.6394340694125204e-05, "loss": 0.3788, "step": 980 }, { "epoch": 1.0, "learning_rate": 1.6143649880966164e-05, "loss": 0.4028, "step": 990 }, { "epoch": 1.01, "learning_rate": 1.5892637326295542e-05, "loss": 0.3858, "step": 1000 }, { "epoch": 1.01, "eval_loss": 0.4780150055885315, "eval_runtime": 11.6787, "eval_samples_per_second": 3.853, "eval_steps_per_second": 1.028, "step": 1000 }, { "epoch": 1.02, "learning_rate": 1.5641373647144043e-05, "loss": 0.3697, "step": 1010 }, { "epoch": 1.03, "learning_rate": 1.538992953119089e-05, "loss": 0.3503, "step": 1020 }, { "epoch": 1.04, "learning_rate": 1.5138375716877379e-05, "loss": 0.3831, "step": 1030 }, { "epoch": 1.05, "learning_rate": 1.4886782973506064e-05, "loss": 0.3854, "step": 1040 }, { "epoch": 1.06, "learning_rate": 1.463522208133139e-05, "loss": 0.3572, "step": 1050 }, { "epoch": 1.08, "learning_rate": 1.4383763811647126e-05, "loss": 0.3455, "step": 1060 }, { "epoch": 1.09, "learning_rate": 1.4132478906876406e-05, "loss": 0.3861, "step": 1070 }, { "epoch": 1.1, "learning_rate": 1.3881438060669832e-05, "loss": 0.3825, "step": 1080 }, { "epoch": 1.11, "learning_rate": 1.3630711898017336e-05, "loss": 0.3487, "step": 1090 }, { "epoch": 1.12, "learning_rate": 1.3380370955379331e-05, "loss": 0.3885, "step": 1100 }, { "epoch": 1.13, "learning_rate": 1.3130485660842818e-05, "loss": 0.3644, "step": 1110 }, { "epoch": 1.14, "learning_rate": 1.2881126314307895e-05, "loss": 0.3769, "step": 1120 }, { "epoch": 1.15, "learning_rate": 1.2632363067710408e-05, "loss": 0.3709, "step": 1130 }, { "epoch": 1.16, "learning_rate": 1.2384265905286195e-05, "loss": 0.3795, "step": 1140 }, { "epoch": 1.17, "learning_rate": 1.213690462388252e-05, "loss": 0.3525, "step": 1150 }, { "epoch": 1.18, "learning_rate": 1.1890348813322198e-05, "loss": 0.3664, "step": 1160 }, { "epoch": 1.19, "learning_rate": 1.1644667836826004e-05, "loss": 0.374, "step": 1170 }, { "epoch": 1.2, "learning_rate": 1.1399930811498802e-05, "loss": 0.3554, "step": 1180 }, { "epoch": 1.21, "learning_rate": 1.115620658888491e-05, "loss": 0.3747, "step": 1190 }, { "epoch": 1.22, "learning_rate": 1.0913563735598206e-05, "loss": 0.3676, "step": 1200 }, { "epoch": 1.23, "learning_rate": 1.0672070514032354e-05, "loss": 0.3478, "step": 1210 }, { "epoch": 1.24, "learning_rate": 1.0431794863156649e-05, "loss": 0.3633, "step": 1220 }, { "epoch": 1.25, "learning_rate": 1.0192804379402823e-05, "loss": 0.3591, "step": 1230 }, { "epoch": 1.26, "learning_rate": 9.955166297648241e-06, "loss": 0.3727, "step": 1240 }, { "epoch": 1.27, "learning_rate": 9.718947472300812e-06, "loss": 0.3669, "step": 1250 }, { "epoch": 1.28, "learning_rate": 9.484214358490896e-06, "loss": 0.3653, "step": 1260 }, { "epoch": 1.29, "learning_rate": 9.251032993375591e-06, "loss": 0.3713, "step": 1270 }, { "epoch": 1.3, "learning_rate": 9.019468977560585e-06, "loss": 0.3622, "step": 1280 }, { "epoch": 1.31, "learning_rate": 8.789587456644819e-06, "loss": 0.3562, "step": 1290 }, { "epoch": 1.32, "learning_rate": 8.561453102893177e-06, "loss": 0.3619, "step": 1300 }, { "epoch": 1.33, "learning_rate": 8.335130097042284e-06, "loss": 0.4009, "step": 1310 }, { "epoch": 1.34, "learning_rate": 8.110682110244685e-06, "loss": 0.347, "step": 1320 }, { "epoch": 1.35, "learning_rate": 7.888172286156265e-06, "loss": 0.3891, "step": 1330 }, { "epoch": 1.36, "learning_rate": 7.667663223172156e-06, "loss": 0.3655, "step": 1340 }, { "epoch": 1.37, "learning_rate": 7.449216956816015e-06, "loss": 0.3552, "step": 1350 }, { "epoch": 1.38, "learning_rate": 7.232894942287638e-06, "loss": 0.3632, "step": 1360 }, { "epoch": 1.39, "learning_rate": 7.018758037173876e-06, "loss": 0.3519, "step": 1370 }, { "epoch": 1.4, "learning_rate": 6.806866484327612e-06, "loss": 0.375, "step": 1380 }, { "epoch": 1.41, "learning_rate": 6.597279894919741e-06, "loss": 0.3745, "step": 1390 }, { "epoch": 1.42, "learning_rate": 6.390057231668853e-06, "loss": 0.3651, "step": 1400 }, { "epoch": 1.43, "learning_rate": 6.185256792253298e-06, "loss": 0.3639, "step": 1410 }, { "epoch": 1.44, "learning_rate": 5.982936192910392e-06, "loss": 0.3725, "step": 1420 }, { "epoch": 1.45, "learning_rate": 5.78315235222732e-06, "loss": 0.3568, "step": 1430 }, { "epoch": 1.46, "learning_rate": 5.585961475128293e-06, "loss": 0.3373, "step": 1440 }, { "epoch": 1.47, "learning_rate": 5.39141903706252e-06, "loss": 0.3593, "step": 1450 }, { "epoch": 1.48, "learning_rate": 5.199579768397331e-06, "loss": 0.3451, "step": 1460 }, { "epoch": 1.49, "learning_rate": 5.010497639020985e-06, "loss": 0.3503, "step": 1470 }, { "epoch": 1.5, "learning_rate": 4.824225843159382e-06, "loss": 0.3575, "step": 1480 }, { "epoch": 1.51, "learning_rate": 4.640816784411005e-06, "loss": 0.3542, "step": 1490 }, { "epoch": 1.52, "learning_rate": 4.460322061004282e-06, "loss": 0.3447, "step": 1500 }, { "epoch": 1.53, "learning_rate": 4.28279245128152e-06, "loss": 0.3791, "step": 1510 }, { "epoch": 1.54, "learning_rate": 4.108277899413525e-06, "loss": 0.38, "step": 1520 }, { "epoch": 1.55, "learning_rate": 3.936827501348838e-06, "loss": 0.3568, "step": 1530 }, { "epoch": 1.56, "learning_rate": 3.76848949100166e-06, "loss": 0.384, "step": 1540 }, { "epoch": 1.57, "learning_rate": 3.6033112266822636e-06, "loss": 0.3476, "step": 1550 }, { "epoch": 1.58, "learning_rate": 3.441339177773745e-06, "loss": 0.3807, "step": 1560 }, { "epoch": 1.59, "learning_rate": 3.2826189116588445e-06, "loss": 0.3548, "step": 1570 }, { "epoch": 1.6, "learning_rate": 3.1271950809005424e-06, "loss": 0.371, "step": 1580 }, { "epoch": 1.61, "learning_rate": 2.9751114106800186e-06, "loss": 0.373, "step": 1590 }, { "epoch": 1.62, "learning_rate": 2.82641068649549e-06, "loss": 0.3704, "step": 1600 }, { "epoch": 1.63, "learning_rate": 2.6811347421254266e-06, "loss": 0.3719, "step": 1610 }, { "epoch": 1.64, "learning_rate": 2.5393244478595223e-06, "loss": 0.3669, "step": 1620 }, { "epoch": 1.65, "learning_rate": 2.401019699000693e-06, "loss": 0.3544, "step": 1630 }, { "epoch": 1.66, "learning_rate": 2.266259404641401e-06, "loss": 0.3862, "step": 1640 }, { "epoch": 1.67, "learning_rate": 2.135081476717396e-06, "loss": 0.3763, "step": 1650 }, { "epoch": 1.68, "learning_rate": 2.007522819342005e-06, "loss": 0.3751, "step": 1660 }, { "epoch": 1.69, "learning_rate": 1.8836193184239536e-06, "loss": 0.3443, "step": 1670 }, { "epoch": 1.7, "learning_rate": 1.7634058315716257e-06, "loss": 0.3694, "step": 1680 }, { "epoch": 1.71, "learning_rate": 1.6469161782866115e-06, "loss": 0.3459, "step": 1690 }, { "epoch": 1.72, "learning_rate": 1.5341831304493258e-06, "loss": 0.3807, "step": 1700 }, { "epoch": 1.73, "learning_rate": 1.425238403099323e-06, "loss": 0.3747, "step": 1710 }, { "epoch": 1.74, "learning_rate": 1.3201126455129669e-06, "loss": 0.353, "step": 1720 }, { "epoch": 1.75, "learning_rate": 1.2188354325808853e-06, "loss": 0.3695, "step": 1730 }, { "epoch": 1.76, "learning_rate": 1.1214352564877266e-06, "loss": 0.3623, "step": 1740 }, { "epoch": 1.77, "learning_rate": 1.027939518696478e-06, "loss": 0.3616, "step": 1750 }, { "epoch": 1.78, "learning_rate": 9.383745222396545e-07, "loss": 0.366, "step": 1760 }, { "epoch": 1.8, "learning_rate": 8.527654643194959e-07, "loss": 0.3463, "step": 1770 }, { "epoch": 1.81, "learning_rate": 7.711364292192768e-07, "loss": 0.3657, "step": 1780 }, { "epoch": 1.82, "learning_rate": 6.935103815277094e-07, "loss": 0.3636, "step": 1790 }, { "epoch": 1.83, "learning_rate": 6.199091596783352e-07, "loss": 0.3644, "step": 1800 }, { "epoch": 1.84, "learning_rate": 5.503534698057561e-07, "loss": 0.3631, "step": 1810 }, { "epoch": 1.85, "learning_rate": 4.848628799204024e-07, "loss": 0.3544, "step": 1820 }, { "epoch": 1.86, "learning_rate": 4.2345581440348834e-07, "loss": 0.354, "step": 1830 }, { "epoch": 1.87, "learning_rate": 3.66149548823711e-07, "loss": 0.3841, "step": 1840 }, { "epoch": 1.88, "learning_rate": 3.129602050771285e-07, "loss": 0.3435, "step": 1850 }, { "epoch": 1.89, "learning_rate": 2.639027468516181e-07, "loss": 0.3456, "step": 1860 }, { "epoch": 1.9, "learning_rate": 2.1899097541715485e-07, "loss": 0.3673, "step": 1870 }, { "epoch": 1.91, "learning_rate": 1.782375257431196e-07, "loss": 0.3566, "step": 1880 }, { "epoch": 1.92, "learning_rate": 1.4165386294372028e-07, "loss": 0.3729, "step": 1890 }, { "epoch": 1.93, "learning_rate": 1.0925027905253072e-07, "loss": 0.3607, "step": 1900 }, { "epoch": 1.94, "learning_rate": 8.103589012703805e-08, "loss": 0.3696, "step": 1910 }, { "epoch": 1.95, "learning_rate": 5.701863368403948e-08, "loss": 0.3594, "step": 1920 }, { "epoch": 1.96, "learning_rate": 3.720526646659339e-08, "loss": 0.3631, "step": 1930 }, { "epoch": 1.97, "learning_rate": 2.1601362543148263e-08, "loss": 0.3805, "step": 1940 }, { "epoch": 1.98, "learning_rate": 1.0211311739400376e-08, "loss": 0.3676, "step": 1950 }, { "epoch": 1.99, "learning_rate": 3.038318403308371e-09, "loss": 0.3749, "step": 1960 }, { "epoch": 2.0, "learning_rate": 8.440050362101825e-11, "loss": 0.3555, "step": 1970 }, { "epoch": 2.0, "step": 1972, "total_flos": 3.329734212625367e+17, "train_loss": 0.3953790905872175, "train_runtime": 6822.6023, "train_samples_per_second": 1.155, "train_steps_per_second": 0.289 } ], "logging_steps": 10, "max_steps": 1972, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 3.329734212625367e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }