{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 80, "global_step": 1040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004807692307692308, "grad_norm": 172.91949232049635, "learning_rate": 7.142857142857143e-07, "loss": 1.275, "step": 5 }, { "epoch": 0.009615384615384616, "grad_norm": 15.162819415928812, "learning_rate": 1.4285714285714286e-06, "loss": 1.0842, "step": 10 }, { "epoch": 0.014423076923076924, "grad_norm": 43.156613200294984, "learning_rate": 2.142857142857143e-06, "loss": 1.0731, "step": 15 }, { "epoch": 0.019230769230769232, "grad_norm": 12.05730410265307, "learning_rate": 2.8571428571428573e-06, "loss": 0.9966, "step": 20 }, { "epoch": 0.02403846153846154, "grad_norm": 29.645022197375592, "learning_rate": 3.5714285714285714e-06, "loss": 0.9506, "step": 25 }, { "epoch": 0.028846153846153848, "grad_norm": 27.496331665845375, "learning_rate": 4.285714285714286e-06, "loss": 0.9081, "step": 30 }, { "epoch": 0.03365384615384615, "grad_norm": 33.04607017399404, "learning_rate": 5e-06, "loss": 0.9839, "step": 35 }, { "epoch": 0.038461538461538464, "grad_norm": 4.67215100059801, "learning_rate": 5.7142857142857145e-06, "loss": 0.9143, "step": 40 }, { "epoch": 0.04326923076923077, "grad_norm": 3.5686279553698648, "learning_rate": 5.958760472832704e-06, "loss": 1.0449, "step": 45 }, { "epoch": 0.04807692307692308, "grad_norm": 3.5137053750826093, "learning_rate": 5.890441320869003e-06, "loss": 1.1233, "step": 50 }, { "epoch": 0.052884615384615384, "grad_norm": 39.50156599931848, "learning_rate": 5.822637783235761e-06, "loss": 0.7838, "step": 55 }, { "epoch": 0.057692307692307696, "grad_norm": 3.9954682494826987, "learning_rate": 5.755348556225628e-06, "loss": 0.937, "step": 60 }, { "epoch": 0.0625, "grad_norm": 3.606702118329892, "learning_rate": 5.688572332818116e-06, "loss": 0.9261, "step": 65 }, { "epoch": 0.0673076923076923, "grad_norm": 5.018248874418577, "learning_rate": 5.622307802654199e-06, "loss": 0.929, "step": 70 }, { "epoch": 0.07211538461538461, "grad_norm": 14.46426266685474, "learning_rate": 5.556553652010609e-06, "loss": 1.0281, "step": 75 }, { "epoch": 0.07692307692307693, "grad_norm": 6.717639424187089, "learning_rate": 5.4913085637737825e-06, "loss": 1.0252, "step": 80 }, { "epoch": 0.07692307692307693, "eval_loss": 1.0117295980453491, "eval_runtime": 22.175, "eval_samples_per_second": 9.019, "eval_steps_per_second": 2.255, "step": 80 }, { "epoch": 0.08173076923076923, "grad_norm": 5.9628841757039055, "learning_rate": 5.42657121741348e-06, "loss": 0.9798, "step": 85 }, { "epoch": 0.08653846153846154, "grad_norm": 2.8480154385386633, "learning_rate": 5.362340288956054e-06, "loss": 0.9422, "step": 90 }, { "epoch": 0.09134615384615384, "grad_norm": 3.041511518203806, "learning_rate": 5.298614450957377e-06, "loss": 0.7751, "step": 95 }, { "epoch": 0.09615384615384616, "grad_norm": 3.245117237641717, "learning_rate": 5.235392372475402e-06, "loss": 1.0559, "step": 100 }, { "epoch": 0.10096153846153846, "grad_norm": 4.186131160050305, "learning_rate": 5.1726727190423596e-06, "loss": 0.8535, "step": 105 }, { "epoch": 0.10576923076923077, "grad_norm": 3.2112656498482743, "learning_rate": 5.110454152636601e-06, "loss": 1.0847, "step": 110 }, { "epoch": 0.11057692307692307, "grad_norm": 3.5829116342694443, "learning_rate": 5.04873533165404e-06, "loss": 0.989, "step": 115 }, { "epoch": 0.11538461538461539, "grad_norm": 2.8192163511739308, "learning_rate": 4.987514910879233e-06, "loss": 0.7562, "step": 120 }, { "epoch": 0.1201923076923077, "grad_norm": 3.552581067366997, "learning_rate": 4.9267915414560465e-06, "loss": 0.882, "step": 125 }, { "epoch": 0.125, "grad_norm": 3.166131159213283, "learning_rate": 4.866563870857949e-06, "loss": 0.8461, "step": 130 }, { "epoch": 0.12980769230769232, "grad_norm": 3.4902158184612873, "learning_rate": 4.806830542857871e-06, "loss": 1.0949, "step": 135 }, { "epoch": 0.1346153846153846, "grad_norm": 2.763230275625746, "learning_rate": 4.7475901974976784e-06, "loss": 0.9741, "step": 140 }, { "epoch": 0.13942307692307693, "grad_norm": 3.7680960024565047, "learning_rate": 4.688841471057191e-06, "loss": 0.8267, "step": 145 }, { "epoch": 0.14423076923076922, "grad_norm": 3.7223152035406177, "learning_rate": 4.630582996022805e-06, "loss": 0.9237, "step": 150 }, { "epoch": 0.14903846153846154, "grad_norm": 163.68789967501425, "learning_rate": 4.572813401055646e-06, "loss": 0.9735, "step": 155 }, { "epoch": 0.15384615384615385, "grad_norm": 4.052744089990857, "learning_rate": 4.515531310959294e-06, "loss": 0.8185, "step": 160 }, { "epoch": 0.15384615384615385, "eval_loss": 0.9820164442062378, "eval_runtime": 20.5987, "eval_samples_per_second": 9.709, "eval_steps_per_second": 2.427, "step": 160 }, { "epoch": 0.15865384615384615, "grad_norm": 3.5962134321693213, "learning_rate": 4.458735346647049e-06, "loss": 0.9701, "step": 165 }, { "epoch": 0.16346153846153846, "grad_norm": 3.405720690826482, "learning_rate": 4.402424125108714e-06, "loss": 0.7428, "step": 170 }, { "epoch": 0.16826923076923078, "grad_norm": 3.5656581164655297, "learning_rate": 4.346596259376934e-06, "loss": 1.0573, "step": 175 }, { "epoch": 0.17307692307692307, "grad_norm": 3.1116839574479944, "learning_rate": 4.291250358493015e-06, "loss": 0.99, "step": 180 }, { "epoch": 0.1778846153846154, "grad_norm": 3.1856579669538037, "learning_rate": 4.236385027472282e-06, "loss": 0.9208, "step": 185 }, { "epoch": 0.18269230769230768, "grad_norm": 2.713262969000155, "learning_rate": 4.181998867268901e-06, "loss": 0.9552, "step": 190 }, { "epoch": 0.1875, "grad_norm": 3.4690878970474364, "learning_rate": 4.1280904747402165e-06, "loss": 0.9004, "step": 195 }, { "epoch": 0.19230769230769232, "grad_norm": 2.6094836512830755, "learning_rate": 4.07465844261054e-06, "loss": 1.0189, "step": 200 }, { "epoch": 0.1971153846153846, "grad_norm": 2.7258662188339917, "learning_rate": 4.021701359434411e-06, "loss": 0.8663, "step": 205 }, { "epoch": 0.20192307692307693, "grad_norm": 2.130745708170683, "learning_rate": 3.9692178095593185e-06, "loss": 0.9191, "step": 210 }, { "epoch": 0.20673076923076922, "grad_norm": 3.632025896546127, "learning_rate": 3.917206373087843e-06, "loss": 0.8463, "step": 215 }, { "epoch": 0.21153846153846154, "grad_norm": 2.8163127172248754, "learning_rate": 3.86566562583925e-06, "loss": 0.9113, "step": 220 }, { "epoch": 0.21634615384615385, "grad_norm": 2.925143301211318, "learning_rate": 3.814594139310489e-06, "loss": 0.8026, "step": 225 }, { "epoch": 0.22115384615384615, "grad_norm": 3.491601498263278, "learning_rate": 3.7639904806365957e-06, "loss": 1.0014, "step": 230 }, { "epoch": 0.22596153846153846, "grad_norm": 3.60018394829918, "learning_rate": 3.7138532125504874e-06, "loss": 0.8704, "step": 235 }, { "epoch": 0.23076923076923078, "grad_norm": 2.8380175093955193, "learning_rate": 3.664180893342146e-06, "loss": 0.9686, "step": 240 }, { "epoch": 0.23076923076923078, "eval_loss": 0.9701676964759827, "eval_runtime": 21.0027, "eval_samples_per_second": 9.523, "eval_steps_per_second": 2.381, "step": 240 }, { "epoch": 0.23557692307692307, "grad_norm": 2.88654950044071, "learning_rate": 3.6149720768171497e-06, "loss": 0.9927, "step": 245 }, { "epoch": 0.2403846153846154, "grad_norm": 4.075449595613525, "learning_rate": 3.5662253122545742e-06, "loss": 0.8335, "step": 250 }, { "epoch": 0.24519230769230768, "grad_norm": 3.1216694362939137, "learning_rate": 3.517939144364211e-06, "loss": 0.9225, "step": 255 }, { "epoch": 0.25, "grad_norm": 3.4006563474977787, "learning_rate": 3.4701121132431283e-06, "loss": 0.9645, "step": 260 }, { "epoch": 0.2548076923076923, "grad_norm": 5.006159977047571, "learning_rate": 3.422742754331519e-06, "loss": 1.0596, "step": 265 }, { "epoch": 0.25961538461538464, "grad_norm": 6.352134442675443, "learning_rate": 3.3758295983678575e-06, "loss": 0.8279, "step": 270 }, { "epoch": 0.2644230769230769, "grad_norm": 4.599350051977448, "learning_rate": 3.329371171343321e-06, "loss": 0.7653, "step": 275 }, { "epoch": 0.2692307692307692, "grad_norm": 3.775428351149461, "learning_rate": 3.2833659944554757e-06, "loss": 0.8703, "step": 280 }, { "epoch": 0.27403846153846156, "grad_norm": 2.5830965772659287, "learning_rate": 3.2378125840611978e-06, "loss": 0.826, "step": 285 }, { "epoch": 0.27884615384615385, "grad_norm": 3.6279739322810642, "learning_rate": 3.192709451628821e-06, "loss": 0.8617, "step": 290 }, { "epoch": 0.28365384615384615, "grad_norm": 2.574184072314173, "learning_rate": 3.1480551036895063e-06, "loss": 0.9925, "step": 295 }, { "epoch": 0.28846153846153844, "grad_norm": 3.245622107244932, "learning_rate": 3.1038480417877728e-06, "loss": 0.8276, "step": 300 }, { "epoch": 0.2932692307692308, "grad_norm": 2.7094531818622385, "learning_rate": 3.0600867624312124e-06, "loss": 0.93, "step": 305 }, { "epoch": 0.2980769230769231, "grad_norm": 3.4108002405937996, "learning_rate": 3.0167697570393586e-06, "loss": 0.9093, "step": 310 }, { "epoch": 0.30288461538461536, "grad_norm": 3.2261512213908468, "learning_rate": 2.973895511891673e-06, "loss": 0.8436, "step": 315 }, { "epoch": 0.3076923076923077, "grad_norm": 2.9111217804814733, "learning_rate": 2.9314625080746407e-06, "loss": 0.7962, "step": 320 }, { "epoch": 0.3076923076923077, "eval_loss": 0.9604336619377136, "eval_runtime": 20.7503, "eval_samples_per_second": 9.638, "eval_steps_per_second": 2.41, "step": 320 }, { "epoch": 0.3125, "grad_norm": 3.0069826903052568, "learning_rate": 2.8894692214279614e-06, "loss": 0.9501, "step": 325 }, { "epoch": 0.3173076923076923, "grad_norm": 2.7402700321309497, "learning_rate": 2.8479141224897947e-06, "loss": 0.8932, "step": 330 }, { "epoch": 0.32211538461538464, "grad_norm": 2.850461668225791, "learning_rate": 2.806795676441052e-06, "loss": 0.8509, "step": 335 }, { "epoch": 0.3269230769230769, "grad_norm": 2.8055976999039833, "learning_rate": 2.7661123430487023e-06, "loss": 0.8531, "step": 340 }, { "epoch": 0.3317307692307692, "grad_norm": 3.950790855598453, "learning_rate": 2.725862576608072e-06, "loss": 0.8428, "step": 345 }, { "epoch": 0.33653846153846156, "grad_norm": 2.608925093832874, "learning_rate": 2.6860448258841182e-06, "loss": 0.9324, "step": 350 }, { "epoch": 0.34134615384615385, "grad_norm": 4.161582883109561, "learning_rate": 2.6466575340516312e-06, "loss": 0.8302, "step": 355 }, { "epoch": 0.34615384615384615, "grad_norm": 3.223665474192437, "learning_rate": 2.607699138634365e-06, "loss": 1.0338, "step": 360 }, { "epoch": 0.35096153846153844, "grad_norm": 4.360630028683017, "learning_rate": 2.5691680714430463e-06, "loss": 0.781, "step": 365 }, { "epoch": 0.3557692307692308, "grad_norm": 3.2326801834772256, "learning_rate": 2.531062758512248e-06, "loss": 0.9277, "step": 370 }, { "epoch": 0.3605769230769231, "grad_norm": 3.518325507567999, "learning_rate": 2.493381620036082e-06, "loss": 0.7648, "step": 375 }, { "epoch": 0.36538461538461536, "grad_norm": 3.905842925893686, "learning_rate": 2.4561230703027005e-06, "loss": 0.7278, "step": 380 }, { "epoch": 0.3701923076923077, "grad_norm": 5.371293959548764, "learning_rate": 2.4192855176275597e-06, "loss": 0.7564, "step": 385 }, { "epoch": 0.375, "grad_norm": 2.850075623051217, "learning_rate": 2.382867364285416e-06, "loss": 0.7983, "step": 390 }, { "epoch": 0.3798076923076923, "grad_norm": 6.661652196819241, "learning_rate": 2.3468670064410194e-06, "loss": 0.9005, "step": 395 }, { "epoch": 0.38461538461538464, "grad_norm": 4.700394864120094, "learning_rate": 2.3112828340784763e-06, "loss": 0.8669, "step": 400 }, { "epoch": 0.38461538461538464, "eval_loss": 0.9519588351249695, "eval_runtime": 20.79, "eval_samples_per_second": 9.62, "eval_steps_per_second": 2.405, "step": 400 }, { "epoch": 0.3894230769230769, "grad_norm": 3.3197778882289297, "learning_rate": 2.2761132309292435e-06, "loss": 0.8864, "step": 405 }, { "epoch": 0.3942307692307692, "grad_norm": 4.198490325675027, "learning_rate": 2.241356574398701e-06, "loss": 0.9219, "step": 410 }, { "epoch": 0.39903846153846156, "grad_norm": 8.447734132502742, "learning_rate": 2.2070112354912867e-06, "loss": 0.9542, "step": 415 }, { "epoch": 0.40384615384615385, "grad_norm": 3.6043476480492873, "learning_rate": 2.1730755787341422e-06, "loss": 0.7828, "step": 420 }, { "epoch": 0.40865384615384615, "grad_norm": 3.550876988072227, "learning_rate": 2.1395479620992237e-06, "loss": 0.9213, "step": 425 }, { "epoch": 0.41346153846153844, "grad_norm": 4.346265355776214, "learning_rate": 2.1064267369238405e-06, "loss": 0.8832, "step": 430 }, { "epoch": 0.4182692307692308, "grad_norm": 8.956356184457416, "learning_rate": 2.0737102478295753e-06, "loss": 1.0524, "step": 435 }, { "epoch": 0.4230769230769231, "grad_norm": 4.0026073252992225, "learning_rate": 2.0413968326395454e-06, "loss": 0.8951, "step": 440 }, { "epoch": 0.42788461538461536, "grad_norm": 3.769313811024604, "learning_rate": 2.009484822293941e-06, "loss": 0.8803, "step": 445 }, { "epoch": 0.4326923076923077, "grad_norm": 3.4579810908904927, "learning_rate": 1.9779725407638038e-06, "loss": 0.8575, "step": 450 }, { "epoch": 0.4375, "grad_norm": 3.6235925820400112, "learning_rate": 1.946858304962993e-06, "loss": 0.874, "step": 455 }, { "epoch": 0.4423076923076923, "grad_norm": 3.2454132821623607, "learning_rate": 1.9161404246582834e-06, "loss": 1.0103, "step": 460 }, { "epoch": 0.44711538461538464, "grad_norm": 3.438741636237806, "learning_rate": 1.8858172023775289e-06, "loss": 0.8943, "step": 465 }, { "epoch": 0.4519230769230769, "grad_norm": 3.1798809256755205, "learning_rate": 1.8558869333158512e-06, "loss": 0.9638, "step": 470 }, { "epoch": 0.4567307692307692, "grad_norm": 3.6082058444107177, "learning_rate": 1.8263479052397838e-06, "loss": 0.8781, "step": 475 }, { "epoch": 0.46153846153846156, "grad_norm": 2.83102154533938, "learning_rate": 1.7971983983893046e-06, "loss": 0.8883, "step": 480 }, { "epoch": 0.46153846153846156, "eval_loss": 0.9505824446678162, "eval_runtime": 20.9063, "eval_samples_per_second": 9.566, "eval_steps_per_second": 2.392, "step": 480 }, { "epoch": 0.46634615384615385, "grad_norm": 2.9075319767858425, "learning_rate": 1.768436685377699e-06, "loss": 0.7087, "step": 485 }, { "epoch": 0.47115384615384615, "grad_norm": 3.7507183698931117, "learning_rate": 1.7400610310891816e-06, "loss": 0.928, "step": 490 }, { "epoch": 0.47596153846153844, "grad_norm": 3.0576523378992326, "learning_rate": 1.7120696925742107e-06, "loss": 0.8047, "step": 495 }, { "epoch": 0.4807692307692308, "grad_norm": 2.6687945237895287, "learning_rate": 1.6844609189424112e-06, "loss": 1.0923, "step": 500 }, { "epoch": 0.4855769230769231, "grad_norm": 3.7056881913494277, "learning_rate": 1.6572329512530394e-06, "loss": 0.7718, "step": 505 }, { "epoch": 0.49038461538461536, "grad_norm": 4.261130783269975, "learning_rate": 1.630384022402907e-06, "loss": 0.7462, "step": 510 }, { "epoch": 0.4951923076923077, "grad_norm": 2.8143821099136024, "learning_rate": 1.6039123570116796e-06, "loss": 0.965, "step": 515 }, { "epoch": 0.5, "grad_norm": 3.0264813559392616, "learning_rate": 1.5778161713044614e-06, "loss": 0.8943, "step": 520 }, { "epoch": 0.5048076923076923, "grad_norm": 18.246495136897703, "learning_rate": 1.5520936729915777e-06, "loss": 0.9694, "step": 525 }, { "epoch": 0.5096153846153846, "grad_norm": 4.039649841411536, "learning_rate": 1.5267430611454654e-06, "loss": 0.8589, "step": 530 }, { "epoch": 0.5144230769230769, "grad_norm": 3.028129518354503, "learning_rate": 1.5017625260745615e-06, "loss": 0.8761, "step": 535 }, { "epoch": 0.5192307692307693, "grad_norm": 3.0504275368028115, "learning_rate": 1.4771502491940911e-06, "loss": 0.9293, "step": 540 }, { "epoch": 0.5240384615384616, "grad_norm": 2.520216608258428, "learning_rate": 1.4529044028936606e-06, "loss": 0.7738, "step": 545 }, { "epoch": 0.5288461538461539, "grad_norm": 3.4732840458118197, "learning_rate": 1.4290231504015187e-06, "loss": 0.8173, "step": 550 }, { "epoch": 0.5336538461538461, "grad_norm": 2.992673074333473, "learning_rate": 1.4055046456453867e-06, "loss": 1.0166, "step": 555 }, { "epoch": 0.5384615384615384, "grad_norm": 3.676863247659791, "learning_rate": 1.3823470331097324e-06, "loss": 0.7636, "step": 560 }, { "epoch": 0.5384615384615384, "eval_loss": 0.9441266059875488, "eval_runtime": 20.933, "eval_samples_per_second": 9.554, "eval_steps_per_second": 2.389, "step": 560 }, { "epoch": 0.5432692307692307, "grad_norm": 2.562908465662044, "learning_rate": 1.3595484476893454e-06, "loss": 0.9229, "step": 565 }, { "epoch": 0.5480769230769231, "grad_norm": 2.2982897576935724, "learning_rate": 1.3371070145391023e-06, "loss": 0.8806, "step": 570 }, { "epoch": 0.5528846153846154, "grad_norm": 4.029788762639043, "learning_rate": 1.3150208489197545e-06, "loss": 0.7314, "step": 575 }, { "epoch": 0.5576923076923077, "grad_norm": 3.4816155172912575, "learning_rate": 1.2932880560396128e-06, "loss": 0.819, "step": 580 }, { "epoch": 0.5625, "grad_norm": 3.8108295243391868, "learning_rate": 1.2719067308919584e-06, "loss": 0.7222, "step": 585 }, { "epoch": 0.5673076923076923, "grad_norm": 2.7857292629014183, "learning_rate": 1.2508749580880287e-06, "loss": 0.8022, "step": 590 }, { "epoch": 0.5721153846153846, "grad_norm": 3.6021354748640677, "learning_rate": 1.2301908116853925e-06, "loss": 0.884, "step": 595 }, { "epoch": 0.5769230769230769, "grad_norm": 3.135380180508478, "learning_rate": 1.2098523550115558e-06, "loss": 1.0023, "step": 600 }, { "epoch": 0.5817307692307693, "grad_norm": 3.3653027564726035, "learning_rate": 1.189857640482588e-06, "loss": 0.9518, "step": 605 }, { "epoch": 0.5865384615384616, "grad_norm": 2.459430693726985, "learning_rate": 1.170204709416585e-06, "loss": 0.8211, "step": 610 }, { "epoch": 0.5913461538461539, "grad_norm": 5.022938552667774, "learning_rate": 1.1508915918417567e-06, "loss": 0.7398, "step": 615 }, { "epoch": 0.5961538461538461, "grad_norm": 3.8724856541183357, "learning_rate": 1.1319163062989139e-06, "loss": 0.941, "step": 620 }, { "epoch": 0.6009615384615384, "grad_norm": 3.1280693366860963, "learning_rate": 1.1132768596381337e-06, "loss": 0.815, "step": 625 }, { "epoch": 0.6057692307692307, "grad_norm": 2.8201015243807284, "learning_rate": 1.0949712468093497e-06, "loss": 0.8991, "step": 630 }, { "epoch": 0.6105769230769231, "grad_norm": 3.32788176588362, "learning_rate": 1.076997450646619e-06, "loss": 0.9282, "step": 635 }, { "epoch": 0.6153846153846154, "grad_norm": 3.9582374514755134, "learning_rate": 1.0593534416457847e-06, "loss": 0.8221, "step": 640 }, { "epoch": 0.6153846153846154, "eval_loss": 0.9404194355010986, "eval_runtime": 21.0496, "eval_samples_per_second": 9.501, "eval_steps_per_second": 2.375, "step": 640 }, { "epoch": 0.6201923076923077, "grad_norm": 2.5869189332376004, "learning_rate": 1.0420371777352623e-06, "loss": 0.8804, "step": 645 }, { "epoch": 0.625, "grad_norm": 2.53500848922609, "learning_rate": 1.0250466040396306e-06, "loss": 0.7947, "step": 650 }, { "epoch": 0.6298076923076923, "grad_norm": 3.07037325829785, "learning_rate": 1.0083796526357243e-06, "loss": 0.8485, "step": 655 }, { "epoch": 0.6346153846153846, "grad_norm": 2.5949762709128814, "learning_rate": 9.920342423008766e-07, "loss": 0.7737, "step": 660 }, { "epoch": 0.6394230769230769, "grad_norm": 3.723350500191604, "learning_rate": 9.760082782529624e-07, "loss": 0.8044, "step": 665 }, { "epoch": 0.6442307692307693, "grad_norm": 2.91223481306706, "learning_rate": 9.602996518818617e-07, "loss": 0.8059, "step": 670 }, { "epoch": 0.6490384615384616, "grad_norm": 3.228159750161236, "learning_rate": 9.449062404719376e-07, "loss": 0.9736, "step": 675 }, { "epoch": 0.6538461538461539, "grad_norm": 4.2304614726707594, "learning_rate": 9.298259069151074e-07, "loss": 0.8253, "step": 680 }, { "epoch": 0.6586538461538461, "grad_norm": 3.253581255940029, "learning_rate": 9.15056499414049e-07, "loss": 1.0807, "step": 685 }, { "epoch": 0.6634615384615384, "grad_norm": 4.2515171628124975, "learning_rate": 9.005958511750684e-07, "loss": 0.8206, "step": 690 }, { "epoch": 0.6682692307692307, "grad_norm": 2.7617275421854526, "learning_rate": 8.864417800901062e-07, "loss": 0.9496, "step": 695 }, { "epoch": 0.6730769230769231, "grad_norm": 3.233107996911771, "learning_rate": 8.72592088407351e-07, "loss": 0.9023, "step": 700 }, { "epoch": 0.6778846153846154, "grad_norm": 3.1204863795886184, "learning_rate": 8.590445623898662e-07, "loss": 0.869, "step": 705 }, { "epoch": 0.6826923076923077, "grad_norm": 2.5285063680240234, "learning_rate": 8.457969719616223e-07, "loss": 0.9186, "step": 710 }, { "epoch": 0.6875, "grad_norm": 3.0506459039436336, "learning_rate": 8.32847070340265e-07, "loss": 0.9203, "step": 715 }, { "epoch": 0.6923076923076923, "grad_norm": 3.7957636063897318, "learning_rate": 8.201925936559198e-07, "loss": 0.9417, "step": 720 }, { "epoch": 0.6923076923076923, "eval_loss": 0.9345305562019348, "eval_runtime": 21.1147, "eval_samples_per_second": 9.472, "eval_steps_per_second": 2.368, "step": 720 }, { "epoch": 0.6971153846153846, "grad_norm": 3.3254122602539624, "learning_rate": 8.078312605552745e-07, "loss": 0.9107, "step": 725 }, { "epoch": 0.7019230769230769, "grad_norm": 2.8068324192286487, "learning_rate": 7.957607717901299e-07, "loss": 0.9438, "step": 730 }, { "epoch": 0.7067307692307693, "grad_norm": 3.498836942130792, "learning_rate": 7.839788097895564e-07, "loss": 0.8693, "step": 735 }, { "epoch": 0.7115384615384616, "grad_norm": 2.5787803338017885, "learning_rate": 7.72483038214722e-07, "loss": 0.896, "step": 740 }, { "epoch": 0.7163461538461539, "grad_norm": 3.67630240687256, "learning_rate": 7.612711014953991e-07, "loss": 0.8243, "step": 745 }, { "epoch": 0.7211538461538461, "grad_norm": 2.4521374343388125, "learning_rate": 7.503406243470673e-07, "loss": 1.0063, "step": 750 }, { "epoch": 0.7259615384615384, "grad_norm": 2.6536830050201536, "learning_rate": 7.396892112674676e-07, "loss": 0.8133, "step": 755 }, { "epoch": 0.7307692307692307, "grad_norm": 3.057951252038446, "learning_rate": 7.293144460113513e-07, "loss": 0.8753, "step": 760 }, { "epoch": 0.7355769230769231, "grad_norm": 2.3939129798326815, "learning_rate": 7.192138910420856e-07, "loss": 0.8277, "step": 765 }, { "epoch": 0.7403846153846154, "grad_norm": 2.8809002810189233, "learning_rate": 7.093850869586572e-07, "loss": 0.8746, "step": 770 }, { "epoch": 0.7451923076923077, "grad_norm": 3.272891692948664, "learning_rate": 6.998255518965055e-07, "loss": 0.8711, "step": 775 }, { "epoch": 0.75, "grad_norm": 3.1649449172099073, "learning_rate": 6.905327809004765e-07, "loss": 0.8073, "step": 780 }, { "epoch": 0.7548076923076923, "grad_norm": 2.862835029692555, "learning_rate": 6.815042452680482e-07, "loss": 0.852, "step": 785 }, { "epoch": 0.7596153846153846, "grad_norm": 4.777839902626332, "learning_rate": 6.727373918608166e-07, "loss": 0.7941, "step": 790 }, { "epoch": 0.7644230769230769, "grad_norm": 3.4663518671110403, "learning_rate": 6.642296423820508e-07, "loss": 0.8553, "step": 795 }, { "epoch": 0.7692307692307693, "grad_norm": 3.062550953679388, "learning_rate": 6.559783926179307e-07, "loss": 0.9623, "step": 800 }, { "epoch": 0.7692307692307693, "eval_loss": 0.9317355155944824, "eval_runtime": 21.1215, "eval_samples_per_second": 9.469, "eval_steps_per_second": 2.367, "step": 800 }, { "epoch": 0.7740384615384616, "grad_norm": 2.9850983787230145, "learning_rate": 6.479810116398562e-07, "loss": 0.9048, "step": 805 }, { "epoch": 0.7788461538461539, "grad_norm": 2.5686622431209387, "learning_rate": 6.40234840964976e-07, "loss": 0.7535, "step": 810 }, { "epoch": 0.7836538461538461, "grad_norm": 2.8469066270016894, "learning_rate": 6.327371936718024e-07, "loss": 0.8606, "step": 815 }, { "epoch": 0.7884615384615384, "grad_norm": 3.567677645668133, "learning_rate": 6.254853534674779e-07, "loss": 0.8133, "step": 820 }, { "epoch": 0.7932692307692307, "grad_norm": 2.331177876625003, "learning_rate": 6.184765737029068e-07, "loss": 0.921, "step": 825 }, { "epoch": 0.7980769230769231, "grad_norm": 2.684486602009453, "learning_rate": 6.117080763315794e-07, "loss": 0.8378, "step": 830 }, { "epoch": 0.8028846153846154, "grad_norm": 2.7951045757499546, "learning_rate": 6.051770508074766e-07, "loss": 0.7412, "step": 835 }, { "epoch": 0.8076923076923077, "grad_norm": 4.34395271902391, "learning_rate": 5.98880652916942e-07, "loss": 0.8488, "step": 840 }, { "epoch": 0.8125, "grad_norm": 2.4901987068339175, "learning_rate": 5.928160035388477e-07, "loss": 0.7888, "step": 845 }, { "epoch": 0.8173076923076923, "grad_norm": 3.410681331565254, "learning_rate": 5.869801873267336e-07, "loss": 0.9896, "step": 850 }, { "epoch": 0.8221153846153846, "grad_norm": 3.0373771991309715, "learning_rate": 5.813702513058679e-07, "loss": 0.7731, "step": 855 }, { "epoch": 0.8269230769230769, "grad_norm": 2.6095155256301656, "learning_rate": 5.759832033773325e-07, "loss": 0.9015, "step": 860 }, { "epoch": 0.8317307692307693, "grad_norm": 3.499761379842187, "learning_rate": 5.708160107202719e-07, "loss": 0.8423, "step": 865 }, { "epoch": 0.8365384615384616, "grad_norm": 2.63663041754238, "learning_rate": 5.658655980823239e-07, "loss": 0.8807, "step": 870 }, { "epoch": 0.8413461538461539, "grad_norm": 3.943874822020016, "learning_rate": 5.611288459469594e-07, "loss": 0.8609, "step": 875 }, { "epoch": 0.8461538461538461, "grad_norm": 2.9004043511306525, "learning_rate": 5.566025885649524e-07, "loss": 0.9654, "step": 880 }, { "epoch": 0.8461538461538461, "eval_loss": 0.9302033185958862, "eval_runtime": 21.0263, "eval_samples_per_second": 9.512, "eval_steps_per_second": 2.378, "step": 880 }, { "epoch": 0.8509615384615384, "grad_norm": 3.182299494802371, "learning_rate": 5.522836118354419e-07, "loss": 0.7406, "step": 885 }, { "epoch": 0.8557692307692307, "grad_norm": 3.1170335107274214, "learning_rate": 5.481686510199858e-07, "loss": 0.9893, "step": 890 }, { "epoch": 0.8605769230769231, "grad_norm": 2.437332494806209, "learning_rate": 5.442543882705713e-07, "loss": 0.9432, "step": 895 }, { "epoch": 0.8653846153846154, "grad_norm": 3.248411155382253, "learning_rate": 5.405374499496658e-07, "loss": 0.8199, "step": 900 }, { "epoch": 0.8701923076923077, "grad_norm": 3.699605668699813, "learning_rate": 5.370144037169503e-07, "loss": 0.8742, "step": 905 }, { "epoch": 0.875, "grad_norm": 4.418113021858762, "learning_rate": 5.336817553532644e-07, "loss": 0.8431, "step": 910 }, { "epoch": 0.8798076923076923, "grad_norm": 2.3988015404279874, "learning_rate": 5.305359452873153e-07, "loss": 0.8947, "step": 915 }, { "epoch": 0.8846153846153846, "grad_norm": 3.0267726009783766, "learning_rate": 5.275733447846792e-07, "loss": 0.7263, "step": 920 }, { "epoch": 0.8894230769230769, "grad_norm": 3.722228079235539, "learning_rate": 5.247902517512378e-07, "loss": 0.8365, "step": 925 }, { "epoch": 0.8942307692307693, "grad_norm": 2.603232021464912, "learning_rate": 5.221828860941111e-07, "loss": 1.0223, "step": 930 }, { "epoch": 0.8990384615384616, "grad_norm": 2.784717139792509, "learning_rate": 5.197473845718411e-07, "loss": 0.8666, "step": 935 }, { "epoch": 0.9038461538461539, "grad_norm": 2.864173244146164, "learning_rate": 5.174797950514308e-07, "loss": 0.7097, "step": 940 }, { "epoch": 0.9086538461538461, "grad_norm": 3.1016453769012395, "learning_rate": 5.153760700719024e-07, "loss": 0.9475, "step": 945 }, { "epoch": 0.9134615384615384, "grad_norm": 3.5038468947729973, "learning_rate": 5.13432059591097e-07, "loss": 0.8123, "step": 950 }, { "epoch": 0.9182692307692307, "grad_norm": 3.2927805818210407, "learning_rate": 5.116435027627297e-07, "loss": 0.8134, "step": 955 }, { "epoch": 0.9230769230769231, "grad_norm": 2.3328148005143747, "learning_rate": 5.100060185517474e-07, "loss": 0.9169, "step": 960 }, { "epoch": 0.9230769230769231, "eval_loss": 0.928638756275177, "eval_runtime": 21.0064, "eval_samples_per_second": 9.521, "eval_steps_per_second": 2.38, "step": 960 }, { "epoch": 0.9278846153846154, "grad_norm": 3.644838748812858, "learning_rate": 5.085150949442101e-07, "loss": 0.7718, "step": 965 }, { "epoch": 0.9326923076923077, "grad_norm": 2.7559502909140505, "learning_rate": 5.071660764378547e-07, "loss": 0.9096, "step": 970 }, { "epoch": 0.9375, "grad_norm": 2.5970949524935363, "learning_rate": 5.059541494031398e-07, "loss": 0.8835, "step": 975 }, { "epoch": 0.9423076923076923, "grad_norm": 2.1523312550723066, "learning_rate": 5.048743247693103e-07, "loss": 0.8909, "step": 980 }, { "epoch": 0.9471153846153846, "grad_norm": 5.2539613787039885, "learning_rate": 5.039214172958587e-07, "loss": 0.8688, "step": 985 }, { "epoch": 0.9519230769230769, "grad_norm": 2.9606045980250837, "learning_rate": 5.030900204036544e-07, "loss": 0.8714, "step": 990 }, { "epoch": 0.9567307692307693, "grad_norm": 2.939313716550038, "learning_rate": 5.023744751055416e-07, "loss": 0.9248, "step": 995 }, { "epoch": 0.9615384615384616, "grad_norm": 2.7776091933130473, "learning_rate": 5.017688308926548e-07, "loss": 0.8965, "step": 1000 }, { "epoch": 0.9663461538461539, "grad_norm": 3.3105407766408685, "learning_rate": 5.012667953109271e-07, "loss": 0.8606, "step": 1005 }, { "epoch": 0.9711538461538461, "grad_norm": 7.289088245652649, "learning_rate": 5.008616670245212e-07, "loss": 0.8847, "step": 1010 }, { "epoch": 0.9759615384615384, "grad_norm": 4.342531181739036, "learning_rate": 5.005462435953572e-07, "loss": 0.7237, "step": 1015 }, { "epoch": 0.9807692307692307, "grad_norm": 3.3798170801004304, "learning_rate": 5.003126880797421e-07, "loss": 0.9875, "step": 1020 }, { "epoch": 0.9855769230769231, "grad_norm": 2.413281341822416, "learning_rate": 5.00152322649041e-07, "loss": 0.8558, "step": 1025 }, { "epoch": 0.9903846153846154, "grad_norm": 3.479845931889368, "learning_rate": 5.000552759653955e-07, "loss": 0.6462, "step": 1030 }, { "epoch": 0.9951923076923077, "grad_norm": 3.6411495658522273, "learning_rate": 5.000097715024919e-07, "loss": 0.7703, "step": 1035 }, { "epoch": 1.0, "grad_norm": 2.04941647733406, "learning_rate": 5e-07, "loss": 0.9005, "step": 1040 }, { "epoch": 1.0, "eval_loss": 0.9287646412849426, "eval_runtime": 21.1549, "eval_samples_per_second": 9.454, "eval_steps_per_second": 2.364, "step": 1040 }, { "epoch": 1.0, "step": 1040, "total_flos": 8.200255844856627e+16, "train_loss": 0.8882445046534905, "train_runtime": 9157.2611, "train_samples_per_second": 3.18, "train_steps_per_second": 0.114 } ], "logging_steps": 5, "max_steps": 1040, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1040, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.200255844856627e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }