{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 300, "global_step": 897, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033444816053511705, "grad_norm": 28.25, "learning_rate": 2e-06, "loss": 5.9886, "step": 1 }, { "epoch": 0.016722408026755852, "grad_norm": 16.375, "learning_rate": 1.999901651759575e-06, "loss": 5.9178, "step": 5 }, { "epoch": 0.033444816053511704, "grad_norm": 11.0625, "learning_rate": 1.9995021451869543e-06, "loss": 5.7759, "step": 10 }, { "epoch": 0.05016722408026756, "grad_norm": 9.0625, "learning_rate": 1.9987954562051724e-06, "loss": 5.6469, "step": 15 }, { "epoch": 0.06688963210702341, "grad_norm": 7.5, "learning_rate": 1.9977818020047815e-06, "loss": 5.5274, "step": 20 }, { "epoch": 0.08361204013377926, "grad_norm": 6.9375, "learning_rate": 1.996461494117619e-06, "loss": 5.4792, "step": 25 }, { "epoch": 0.10033444816053512, "grad_norm": 6.65625, "learning_rate": 1.994834938321061e-06, "loss": 5.4114, "step": 30 }, { "epoch": 0.11705685618729098, "grad_norm": 6.0625, "learning_rate": 1.992902634513312e-06, "loss": 5.3313, "step": 35 }, { "epoch": 0.13377926421404682, "grad_norm": 5.8125, "learning_rate": 1.99066517655977e-06, "loss": 5.2905, "step": 40 }, { "epoch": 0.1505016722408027, "grad_norm": 5.21875, "learning_rate": 1.9881232521105087e-06, "loss": 5.2384, "step": 45 }, { "epoch": 0.16722408026755853, "grad_norm": 4.375, "learning_rate": 1.985277642388941e-06, "loss": 5.1707, "step": 50 }, { "epoch": 0.18394648829431437, "grad_norm": 4.25, "learning_rate": 1.982129221951719e-06, "loss": 5.1543, "step": 55 }, { "epoch": 0.20066889632107024, "grad_norm": 3.953125, "learning_rate": 1.978678958419952e-06, "loss": 5.1196, "step": 60 }, { "epoch": 0.21739130434782608, "grad_norm": 4.25, "learning_rate": 1.9749279121818236e-06, "loss": 5.0982, "step": 65 }, { "epoch": 0.23411371237458195, "grad_norm": 4.6875, "learning_rate": 1.9708772360666954e-06, "loss": 5.031, "step": 70 }, { "epoch": 0.2508361204013378, "grad_norm": 4.28125, "learning_rate": 1.9665281749908033e-06, "loss": 5.0424, "step": 75 }, { "epoch": 0.26755852842809363, "grad_norm": 3.265625, "learning_rate": 1.9618820655746486e-06, "loss": 4.981, "step": 80 }, { "epoch": 0.2842809364548495, "grad_norm": 3.28125, "learning_rate": 1.956940335732209e-06, "loss": 4.9588, "step": 85 }, { "epoch": 0.3010033444816054, "grad_norm": 3.359375, "learning_rate": 1.951704504232089e-06, "loss": 4.9495, "step": 90 }, { "epoch": 0.3177257525083612, "grad_norm": 3.03125, "learning_rate": 1.9461761802307492e-06, "loss": 4.9255, "step": 95 }, { "epoch": 0.33444816053511706, "grad_norm": 3.984375, "learning_rate": 1.9403570627779557e-06, "loss": 4.908, "step": 100 }, { "epoch": 0.3511705685618729, "grad_norm": 3.328125, "learning_rate": 1.9342489402945995e-06, "loss": 4.8782, "step": 105 }, { "epoch": 0.36789297658862874, "grad_norm": 2.890625, "learning_rate": 1.9278536900230564e-06, "loss": 4.8632, "step": 110 }, { "epoch": 0.38461538461538464, "grad_norm": 2.875, "learning_rate": 1.921173277450237e-06, "loss": 4.8481, "step": 115 }, { "epoch": 0.4013377926421405, "grad_norm": 3.59375, "learning_rate": 1.9142097557035305e-06, "loss": 4.8467, "step": 120 }, { "epoch": 0.4180602006688963, "grad_norm": 2.9375, "learning_rate": 1.9069652649198002e-06, "loss": 4.8267, "step": 125 }, { "epoch": 0.43478260869565216, "grad_norm": 3.421875, "learning_rate": 1.8994420315876468e-06, "loss": 4.8225, "step": 130 }, { "epoch": 0.451505016722408, "grad_norm": 3.609375, "learning_rate": 1.891642367863127e-06, "loss": 4.7949, "step": 135 }, { "epoch": 0.4682274247491639, "grad_norm": 4.40625, "learning_rate": 1.8835686708591495e-06, "loss": 4.817, "step": 140 }, { "epoch": 0.48494983277591974, "grad_norm": 4.625, "learning_rate": 1.8752234219087537e-06, "loss": 4.8144, "step": 145 }, { "epoch": 0.5016722408026756, "grad_norm": 2.703125, "learning_rate": 1.866609185802511e-06, "loss": 4.7662, "step": 150 }, { "epoch": 0.5183946488294314, "grad_norm": 3.796875, "learning_rate": 1.857728610000272e-06, "loss": 4.7743, "step": 155 }, { "epoch": 0.5351170568561873, "grad_norm": 2.515625, "learning_rate": 1.8485844238175095e-06, "loss": 4.7647, "step": 160 }, { "epoch": 0.5518394648829431, "grad_norm": 2.75, "learning_rate": 1.839179437586502e-06, "loss": 4.7506, "step": 165 }, { "epoch": 0.568561872909699, "grad_norm": 3.0625, "learning_rate": 1.8295165417926206e-06, "loss": 4.7598, "step": 170 }, { "epoch": 0.5852842809364549, "grad_norm": 2.6875, "learning_rate": 1.8195987061859789e-06, "loss": 4.7302, "step": 175 }, { "epoch": 0.6020066889632107, "grad_norm": 3.71875, "learning_rate": 1.8094289788687243e-06, "loss": 4.7313, "step": 180 }, { "epoch": 0.6187290969899666, "grad_norm": 2.515625, "learning_rate": 1.799010485358249e-06, "loss": 4.7353, "step": 185 }, { "epoch": 0.6354515050167224, "grad_norm": 2.5, "learning_rate": 1.7883464276266062e-06, "loss": 4.7363, "step": 190 }, { "epoch": 0.6521739130434783, "grad_norm": 2.90625, "learning_rate": 1.777440083116432e-06, "loss": 4.7022, "step": 195 }, { "epoch": 0.6688963210702341, "grad_norm": 2.609375, "learning_rate": 1.766294803733671e-06, "loss": 4.6943, "step": 200 }, { "epoch": 0.68561872909699, "grad_norm": 3.03125, "learning_rate": 1.754914014817416e-06, "loss": 4.6801, "step": 205 }, { "epoch": 0.7023411371237458, "grad_norm": 2.546875, "learning_rate": 1.743301214087181e-06, "loss": 4.6977, "step": 210 }, { "epoch": 0.7190635451505016, "grad_norm": 3.359375, "learning_rate": 1.7314599705679277e-06, "loss": 4.6801, "step": 215 }, { "epoch": 0.7357859531772575, "grad_norm": 2.59375, "learning_rate": 1.7193939234931776e-06, "loss": 4.6643, "step": 220 }, { "epoch": 0.7525083612040134, "grad_norm": 2.625, "learning_rate": 1.7071067811865474e-06, "loss": 4.6589, "step": 225 }, { "epoch": 0.7692307692307693, "grad_norm": 2.484375, "learning_rate": 1.6946023199220484e-06, "loss": 4.6329, "step": 230 }, { "epoch": 0.7859531772575251, "grad_norm": 2.515625, "learning_rate": 1.681884382763505e-06, "loss": 4.666, "step": 235 }, { "epoch": 0.802675585284281, "grad_norm": 3.171875, "learning_rate": 1.6689568783834447e-06, "loss": 4.6603, "step": 240 }, { "epoch": 0.8193979933110368, "grad_norm": 3.484375, "learning_rate": 1.6558237798618241e-06, "loss": 4.6389, "step": 245 }, { "epoch": 0.8361204013377926, "grad_norm": 2.546875, "learning_rate": 1.6424891234649616e-06, "loss": 4.6501, "step": 250 }, { "epoch": 0.8528428093645485, "grad_norm": 2.9375, "learning_rate": 1.6289570074050492e-06, "loss": 4.6093, "step": 255 }, { "epoch": 0.8695652173913043, "grad_norm": 3.109375, "learning_rate": 1.6152315905806267e-06, "loss": 4.6375, "step": 260 }, { "epoch": 0.8862876254180602, "grad_norm": 3.46875, "learning_rate": 1.6013170912984058e-06, "loss": 4.6377, "step": 265 }, { "epoch": 0.903010033444816, "grad_norm": 2.46875, "learning_rate": 1.5872177859768332e-06, "loss": 4.6326, "step": 270 }, { "epoch": 0.919732441471572, "grad_norm": 2.71875, "learning_rate": 1.572938007831798e-06, "loss": 4.6, "step": 275 }, { "epoch": 0.9364548494983278, "grad_norm": 2.421875, "learning_rate": 1.5584821455448788e-06, "loss": 4.6083, "step": 280 }, { "epoch": 0.9531772575250836, "grad_norm": 3.65625, "learning_rate": 1.5438546419145485e-06, "loss": 4.6106, "step": 285 }, { "epoch": 0.9698996655518395, "grad_norm": 2.421875, "learning_rate": 1.5290599924907432e-06, "loss": 4.6087, "step": 290 }, { "epoch": 0.9866220735785953, "grad_norm": 2.71875, "learning_rate": 1.5141027441932214e-06, "loss": 4.597, "step": 295 }, { "epoch": 1.0033444816053512, "grad_norm": 2.75, "learning_rate": 1.498987493914135e-06, "loss": 4.5658, "step": 300 }, { "epoch": 1.0033444816053512, "eval_loss": 4.585446834564209, "eval_runtime": 7.2206, "eval_samples_per_second": 80.188, "eval_steps_per_second": 2.631, "step": 300 }, { "epoch": 1.020066889632107, "grad_norm": 2.90625, "learning_rate": 1.4837188871052397e-06, "loss": 4.5897, "step": 305 }, { "epoch": 1.0367892976588629, "grad_norm": 2.734375, "learning_rate": 1.4683016163501854e-06, "loss": 4.5885, "step": 310 }, { "epoch": 1.0535117056856187, "grad_norm": 2.75, "learning_rate": 1.452740419922317e-06, "loss": 4.6009, "step": 315 }, { "epoch": 1.0702341137123745, "grad_norm": 2.96875, "learning_rate": 1.4370400803284373e-06, "loss": 4.5789, "step": 320 }, { "epoch": 1.0869565217391304, "grad_norm": 3.4375, "learning_rate": 1.421205422838971e-06, "loss": 4.5435, "step": 325 }, { "epoch": 1.1036789297658862, "grad_norm": 2.734375, "learning_rate": 1.4052413140049897e-06, "loss": 4.566, "step": 330 }, { "epoch": 1.120401337792642, "grad_norm": 2.59375, "learning_rate": 1.389152660162549e-06, "loss": 4.5621, "step": 335 }, { "epoch": 1.137123745819398, "grad_norm": 2.65625, "learning_rate": 1.3729444059247953e-06, "loss": 4.5505, "step": 340 }, { "epoch": 1.1538461538461537, "grad_norm": 2.40625, "learning_rate": 1.3566215326623129e-06, "loss": 4.5709, "step": 345 }, { "epoch": 1.1705685618729098, "grad_norm": 2.390625, "learning_rate": 1.3401890569721723e-06, "loss": 4.5569, "step": 350 }, { "epoch": 1.1872909698996654, "grad_norm": 2.46875, "learning_rate": 1.3236520291361515e-06, "loss": 4.5456, "step": 355 }, { "epoch": 1.2040133779264215, "grad_norm": 2.5625, "learning_rate": 1.3070155315686059e-06, "loss": 4.5543, "step": 360 }, { "epoch": 1.2207357859531773, "grad_norm": 2.65625, "learning_rate": 1.2902846772544622e-06, "loss": 4.5408, "step": 365 }, { "epoch": 1.2374581939799332, "grad_norm": 2.90625, "learning_rate": 1.273464608177818e-06, "loss": 4.5435, "step": 370 }, { "epoch": 1.254180602006689, "grad_norm": 2.578125, "learning_rate": 1.2565604937416266e-06, "loss": 4.5436, "step": 375 }, { "epoch": 1.2709030100334449, "grad_norm": 2.484375, "learning_rate": 1.2395775291789567e-06, "loss": 4.5448, "step": 380 }, { "epoch": 1.2876254180602007, "grad_norm": 2.375, "learning_rate": 1.2225209339563143e-06, "loss": 4.5398, "step": 385 }, { "epoch": 1.3043478260869565, "grad_norm": 2.484375, "learning_rate": 1.2053959501695144e-06, "loss": 4.5383, "step": 390 }, { "epoch": 1.3210702341137124, "grad_norm": 2.421875, "learning_rate": 1.1882078409326002e-06, "loss": 4.5556, "step": 395 }, { "epoch": 1.3377926421404682, "grad_norm": 2.703125, "learning_rate": 1.1709618887603011e-06, "loss": 4.5273, "step": 400 }, { "epoch": 1.354515050167224, "grad_norm": 2.5, "learning_rate": 1.15366339394453e-06, "loss": 4.5262, "step": 405 }, { "epoch": 1.37123745819398, "grad_norm": 2.46875, "learning_rate": 1.1363176729254144e-06, "loss": 4.5391, "step": 410 }, { "epoch": 1.3879598662207357, "grad_norm": 2.953125, "learning_rate": 1.118930056657367e-06, "loss": 4.5218, "step": 415 }, { "epoch": 1.4046822742474916, "grad_norm": 2.421875, "learning_rate": 1.1015058889706942e-06, "loss": 4.5255, "step": 420 }, { "epoch": 1.4214046822742474, "grad_norm": 2.53125, "learning_rate": 1.0840505249292475e-06, "loss": 4.5304, "step": 425 }, { "epoch": 1.4381270903010033, "grad_norm": 2.5, "learning_rate": 1.0665693291846243e-06, "loss": 4.5418, "step": 430 }, { "epoch": 1.4548494983277591, "grad_norm": 2.40625, "learning_rate": 1.0490676743274181e-06, "loss": 4.522, "step": 435 }, { "epoch": 1.471571906354515, "grad_norm": 2.5, "learning_rate": 1.031550939236033e-06, "loss": 4.5346, "step": 440 }, { "epoch": 1.488294314381271, "grad_norm": 2.296875, "learning_rate": 1.0140245074235622e-06, "loss": 4.496, "step": 445 }, { "epoch": 1.5050167224080266, "grad_norm": 2.390625, "learning_rate": 9.964937653832469e-07, "loss": 4.5212, "step": 450 }, { "epoch": 1.5217391304347827, "grad_norm": 2.3125, "learning_rate": 9.78964100933011e-07, "loss": 4.5069, "step": 455 }, { "epoch": 1.5384615384615383, "grad_norm": 2.546875, "learning_rate": 9.614409015595994e-07, "loss": 4.5124, "step": 460 }, { "epoch": 1.5551839464882944, "grad_norm": 2.515625, "learning_rate": 9.43929552762808e-07, "loss": 4.5155, "step": 465 }, { "epoch": 1.57190635451505, "grad_norm": 2.46875, "learning_rate": 9.264354364003326e-07, "loss": 4.5143, "step": 470 }, { "epoch": 1.588628762541806, "grad_norm": 2.25, "learning_rate": 9.089639290337298e-07, "loss": 4.4947, "step": 475 }, { "epoch": 1.605351170568562, "grad_norm": 2.53125, "learning_rate": 8.915204002760122e-07, "loss": 4.5113, "step": 480 }, { "epoch": 1.6220735785953178, "grad_norm": 2.359375, "learning_rate": 8.741102111413748e-07, "loss": 4.5215, "step": 485 }, { "epoch": 1.6387959866220736, "grad_norm": 2.390625, "learning_rate": 8.567387123975647e-07, "loss": 4.4991, "step": 490 }, { "epoch": 1.6555183946488294, "grad_norm": 3.296875, "learning_rate": 8.394112429214029e-07, "loss": 4.5263, "step": 495 }, { "epoch": 1.6722408026755853, "grad_norm": 2.28125, "learning_rate": 8.221331280579564e-07, "loss": 4.5039, "step": 500 }, { "epoch": 1.6889632107023411, "grad_norm": 2.46875, "learning_rate": 8.049096779838717e-07, "loss": 4.5294, "step": 505 }, { "epoch": 1.705685618729097, "grad_norm": 2.734375, "learning_rate": 7.877461860753696e-07, "loss": 4.4868, "step": 510 }, { "epoch": 1.7224080267558528, "grad_norm": 2.484375, "learning_rate": 7.706479272814022e-07, "loss": 4.4988, "step": 515 }, { "epoch": 1.7391304347826086, "grad_norm": 2.3125, "learning_rate": 7.536201565024767e-07, "loss": 4.4843, "step": 520 }, { "epoch": 1.7558528428093645, "grad_norm": 2.359375, "learning_rate": 7.366681069756351e-07, "loss": 4.4878, "step": 525 }, { "epoch": 1.7725752508361206, "grad_norm": 2.578125, "learning_rate": 7.197969886660984e-07, "loss": 4.4925, "step": 530 }, { "epoch": 1.7892976588628762, "grad_norm": 2.65625, "learning_rate": 7.030119866660565e-07, "loss": 4.5185, "step": 535 }, { "epoch": 1.8060200668896322, "grad_norm": 2.375, "learning_rate": 6.863182596011085e-07, "loss": 4.4988, "step": 540 }, { "epoch": 1.8227424749163879, "grad_norm": 2.640625, "learning_rate": 6.697209380448332e-07, "loss": 4.4764, "step": 545 }, { "epoch": 1.839464882943144, "grad_norm": 2.296875, "learning_rate": 6.532251229419809e-07, "loss": 4.4987, "step": 550 }, { "epoch": 1.8561872909698995, "grad_norm": 2.46875, "learning_rate": 6.368358840407752e-07, "loss": 4.5123, "step": 555 }, { "epoch": 1.8729096989966556, "grad_norm": 2.375, "learning_rate": 6.205582583347973e-07, "loss": 4.5019, "step": 560 }, { "epoch": 1.8896321070234112, "grad_norm": 2.28125, "learning_rate": 6.043972485149414e-07, "loss": 4.5041, "step": 565 }, { "epoch": 1.9063545150501673, "grad_norm": 2.453125, "learning_rate": 5.88357821431908e-07, "loss": 4.485, "step": 570 }, { "epoch": 1.9230769230769231, "grad_norm": 2.4375, "learning_rate": 5.724449065697181e-07, "loss": 4.4854, "step": 575 }, { "epoch": 1.939799331103679, "grad_norm": 2.5625, "learning_rate": 5.566633945307052e-07, "loss": 4.5039, "step": 580 }, { "epoch": 1.9565217391304348, "grad_norm": 2.359375, "learning_rate": 5.410181355324621e-07, "loss": 4.507, "step": 585 }, { "epoch": 1.9732441471571907, "grad_norm": 2.3125, "learning_rate": 5.255139379171966e-07, "loss": 4.5087, "step": 590 }, { "epoch": 1.9899665551839465, "grad_norm": 2.359375, "learning_rate": 5.101555666739563e-07, "loss": 4.5007, "step": 595 }, { "epoch": 2.0066889632107023, "grad_norm": 2.25, "learning_rate": 4.949477419741814e-07, "loss": 4.4832, "step": 600 }, { "epoch": 2.0066889632107023, "eval_loss": 4.490821361541748, "eval_runtime": 13.8089, "eval_samples_per_second": 41.929, "eval_steps_per_second": 1.376, "step": 600 }, { "epoch": 2.0234113712374584, "grad_norm": 2.25, "learning_rate": 4.798951377210254e-07, "loss": 4.481, "step": 605 }, { "epoch": 2.040133779264214, "grad_norm": 2.4375, "learning_rate": 4.6500238011290293e-07, "loss": 4.5015, "step": 610 }, { "epoch": 2.05685618729097, "grad_norm": 2.265625, "learning_rate": 4.5027404622169185e-07, "loss": 4.4934, "step": 615 }, { "epoch": 2.0735785953177257, "grad_norm": 2.5, "learning_rate": 4.3571466258603907e-07, "loss": 4.5015, "step": 620 }, { "epoch": 2.0903010033444818, "grad_norm": 2.515625, "learning_rate": 4.2132870382019427e-07, "loss": 4.5043, "step": 625 }, { "epoch": 2.1070234113712374, "grad_norm": 2.390625, "learning_rate": 4.071205912388015e-07, "loss": 4.5132, "step": 630 }, { "epoch": 2.1237458193979935, "grad_norm": 2.484375, "learning_rate": 3.930946914980744e-07, "loss": 4.5032, "step": 635 }, { "epoch": 2.140468227424749, "grad_norm": 2.5625, "learning_rate": 3.7925531525376617e-07, "loss": 4.487, "step": 640 }, { "epoch": 2.157190635451505, "grad_norm": 2.453125, "learning_rate": 3.656067158363546e-07, "loss": 4.4899, "step": 645 }, { "epoch": 2.1739130434782608, "grad_norm": 2.296875, "learning_rate": 3.521530879438407e-07, "loss": 4.4797, "step": 650 }, { "epoch": 2.190635451505017, "grad_norm": 2.34375, "learning_rate": 3.388985663525702e-07, "loss": 4.4826, "step": 655 }, { "epoch": 2.2073578595317724, "grad_norm": 2.375, "learning_rate": 3.2584722464647165e-07, "loss": 4.4947, "step": 660 }, { "epoch": 2.2240802675585285, "grad_norm": 2.328125, "learning_rate": 3.1300307396509826e-07, "loss": 4.4985, "step": 665 }, { "epoch": 2.240802675585284, "grad_norm": 2.203125, "learning_rate": 3.0037006177086345e-07, "loss": 4.4972, "step": 670 }, { "epoch": 2.25752508361204, "grad_norm": 2.75, "learning_rate": 2.879520706358446e-07, "loss": 4.4785, "step": 675 }, { "epoch": 2.274247491638796, "grad_norm": 2.28125, "learning_rate": 2.757529170485332e-07, "loss": 4.4973, "step": 680 }, { "epoch": 2.290969899665552, "grad_norm": 2.359375, "learning_rate": 2.6377635024089087e-07, "loss": 4.4967, "step": 685 }, { "epoch": 2.3076923076923075, "grad_norm": 2.1875, "learning_rate": 2.5202605103607835e-07, "loss": 4.4944, "step": 690 }, { "epoch": 2.3244147157190636, "grad_norm": 2.265625, "learning_rate": 2.4050563071720864e-07, "loss": 4.5062, "step": 695 }, { "epoch": 2.3411371237458196, "grad_norm": 2.421875, "learning_rate": 2.2921862991747121e-07, "loss": 4.4884, "step": 700 }, { "epoch": 2.3578595317725752, "grad_norm": 2.296875, "learning_rate": 2.181685175319702e-07, "loss": 4.4793, "step": 705 }, { "epoch": 2.374581939799331, "grad_norm": 2.46875, "learning_rate": 2.073586896516095e-07, "loss": 4.4759, "step": 710 }, { "epoch": 2.391304347826087, "grad_norm": 2.46875, "learning_rate": 1.9679246851935516e-07, "loss": 4.4862, "step": 715 }, { "epoch": 2.408026755852843, "grad_norm": 2.3125, "learning_rate": 1.864731015091908e-07, "loss": 4.4886, "step": 720 }, { "epoch": 2.4247491638795986, "grad_norm": 2.34375, "learning_rate": 1.7640376012808533e-07, "loss": 4.5015, "step": 725 }, { "epoch": 2.4414715719063547, "grad_norm": 2.234375, "learning_rate": 1.6658753904127732e-07, "loss": 4.4842, "step": 730 }, { "epoch": 2.4581939799331103, "grad_norm": 2.34375, "learning_rate": 1.570274551211732e-07, "loss": 4.5025, "step": 735 }, { "epoch": 2.4749163879598663, "grad_norm": 2.359375, "learning_rate": 1.477264465201572e-07, "loss": 4.492, "step": 740 }, { "epoch": 2.491638795986622, "grad_norm": 2.4375, "learning_rate": 1.3868737176759105e-07, "loss": 4.5102, "step": 745 }, { "epoch": 2.508361204013378, "grad_norm": 2.34375, "learning_rate": 1.2991300889128865e-07, "loss": 4.4899, "step": 750 }, { "epoch": 2.5250836120401337, "grad_norm": 2.296875, "learning_rate": 1.2140605456372854e-07, "loss": 4.5076, "step": 755 }, { "epoch": 2.5418060200668897, "grad_norm": 2.21875, "learning_rate": 1.13169123273271e-07, "loss": 4.4919, "step": 760 }, { "epoch": 2.5585284280936453, "grad_norm": 2.40625, "learning_rate": 1.0520474652063394e-07, "loss": 4.4859, "step": 765 }, { "epoch": 2.5752508361204014, "grad_norm": 2.453125, "learning_rate": 9.751537204087257e-08, "loss": 4.4933, "step": 770 }, { "epoch": 2.591973244147157, "grad_norm": 2.359375, "learning_rate": 9.010336305110345e-08, "loss": 4.493, "step": 775 }, { "epoch": 2.608695652173913, "grad_norm": 2.328125, "learning_rate": 8.297099752420444e-08, "loss": 4.4685, "step": 780 }, { "epoch": 2.625418060200669, "grad_norm": 2.328125, "learning_rate": 7.612046748871326e-08, "loss": 4.4856, "step": 785 }, { "epoch": 2.6421404682274248, "grad_norm": 2.375, "learning_rate": 6.955387835513893e-08, "loss": 4.4676, "step": 790 }, { "epoch": 2.6588628762541804, "grad_norm": 2.25, "learning_rate": 6.327324826889469e-08, "loss": 4.4745, "step": 795 }, { "epoch": 2.6755852842809364, "grad_norm": 2.40625, "learning_rate": 5.728050749005098e-08, "loss": 4.4856, "step": 800 }, { "epoch": 2.6923076923076925, "grad_norm": 2.234375, "learning_rate": 5.1577497800097345e-08, "loss": 4.5096, "step": 805 }, { "epoch": 2.709030100334448, "grad_norm": 2.484375, "learning_rate": 4.616597193589833e-08, "loss": 4.4839, "step": 810 }, { "epoch": 2.7257525083612038, "grad_norm": 2.328125, "learning_rate": 4.1047593051015237e-08, "loss": 4.5145, "step": 815 }, { "epoch": 2.74247491638796, "grad_norm": 2.4375, "learning_rate": 3.622393420456016e-08, "loss": 4.4835, "step": 820 }, { "epoch": 2.759197324414716, "grad_norm": 2.234375, "learning_rate": 3.169647787773866e-08, "loss": 4.4983, "step": 825 }, { "epoch": 2.7759197324414715, "grad_norm": 2.21875, "learning_rate": 2.7466615518231483e-08, "loss": 4.4734, "step": 830 }, { "epoch": 2.7926421404682276, "grad_norm": 2.390625, "learning_rate": 2.353564711255329e-08, "loss": 4.4825, "step": 835 }, { "epoch": 2.809364548494983, "grad_norm": 2.328125, "learning_rate": 1.990478078652047e-08, "loss": 4.4676, "step": 840 }, { "epoch": 2.8260869565217392, "grad_norm": 2.421875, "learning_rate": 1.657513243395159e-08, "loss": 4.4944, "step": 845 }, { "epoch": 2.842809364548495, "grad_norm": 2.203125, "learning_rate": 1.3547725373713403e-08, "loss": 4.4819, "step": 850 }, { "epoch": 2.859531772575251, "grad_norm": 2.34375, "learning_rate": 1.0823490035218986e-08, "loss": 4.4775, "step": 855 }, { "epoch": 2.8762541806020065, "grad_norm": 2.3125, "learning_rate": 8.403263672473792e-09, "loss": 4.4999, "step": 860 }, { "epoch": 2.8929765886287626, "grad_norm": 2.390625, "learning_rate": 6.2877901067573955e-09, "loss": 4.4673, "step": 865 }, { "epoch": 2.9096989966555182, "grad_norm": 2.53125, "learning_rate": 4.477719498021781e-09, "loss": 4.4816, "step": 870 }, { "epoch": 2.9264214046822743, "grad_norm": 2.265625, "learning_rate": 2.973608145073081e-09, "loss": 4.4942, "step": 875 }, { "epoch": 2.94314381270903, "grad_norm": 2.390625, "learning_rate": 1.7759183146021094e-09, "loss": 4.5004, "step": 880 }, { "epoch": 2.959866220735786, "grad_norm": 2.28125, "learning_rate": 8.850180991131218e-10, "loss": 4.4763, "step": 885 }, { "epoch": 2.976588628762542, "grad_norm": 2.515625, "learning_rate": 3.0118130379575e-10, "loss": 4.4912, "step": 890 }, { "epoch": 2.9933110367892977, "grad_norm": 2.4375, "learning_rate": 2.45873623754278e-11, "loss": 4.473, "step": 895 } ], "logging_steps": 5, "max_steps": 897, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.294195477904884e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }