|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 300, |
|
"global_step": 897, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0033444816053511705, |
|
"grad_norm": 28.25, |
|
"learning_rate": 2e-06, |
|
"loss": 5.9886, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016722408026755852, |
|
"grad_norm": 16.375, |
|
"learning_rate": 1.999901651759575e-06, |
|
"loss": 5.9178, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.033444816053511704, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.9995021451869543e-06, |
|
"loss": 5.7759, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05016722408026756, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 1.9987954562051724e-06, |
|
"loss": 5.6469, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06688963210702341, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.9977818020047815e-06, |
|
"loss": 5.5274, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08361204013377926, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.996461494117619e-06, |
|
"loss": 5.4792, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10033444816053512, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 1.994834938321061e-06, |
|
"loss": 5.4114, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11705685618729098, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.992902634513312e-06, |
|
"loss": 5.3313, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.13377926421404682, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.99066517655977e-06, |
|
"loss": 5.2905, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1505016722408027, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.9881232521105087e-06, |
|
"loss": 5.2384, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.985277642388941e-06, |
|
"loss": 5.1707, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18394648829431437, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.982129221951719e-06, |
|
"loss": 5.1543, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.20066889632107024, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 1.978678958419952e-06, |
|
"loss": 5.1196, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.9749279121818236e-06, |
|
"loss": 5.0982, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23411371237458195, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.9708772360666954e-06, |
|
"loss": 5.031, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2508361204013378, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9665281749908033e-06, |
|
"loss": 5.0424, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.26755852842809363, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.9618820655746486e-06, |
|
"loss": 4.981, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2842809364548495, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.956940335732209e-06, |
|
"loss": 4.9588, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3010033444816054, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.951704504232089e-06, |
|
"loss": 4.9495, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3177257525083612, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.9461761802307492e-06, |
|
"loss": 4.9255, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.33444816053511706, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.9403570627779557e-06, |
|
"loss": 4.908, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3511705685618729, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.9342489402945995e-06, |
|
"loss": 4.8782, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.36789297658862874, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.9278536900230564e-06, |
|
"loss": 4.8632, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.921173277450237e-06, |
|
"loss": 4.8481, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.4013377926421405, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9142097557035305e-06, |
|
"loss": 4.8467, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4180602006688963, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9069652649198002e-06, |
|
"loss": 4.8267, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.8994420315876468e-06, |
|
"loss": 4.8225, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.451505016722408, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.891642367863127e-06, |
|
"loss": 4.7949, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4682274247491639, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.8835686708591495e-06, |
|
"loss": 4.817, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.48494983277591974, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.8752234219087537e-06, |
|
"loss": 4.8144, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5016722408026756, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.866609185802511e-06, |
|
"loss": 4.7662, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5183946488294314, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.857728610000272e-06, |
|
"loss": 4.7743, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5351170568561873, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.8485844238175095e-06, |
|
"loss": 4.7647, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5518394648829431, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.839179437586502e-06, |
|
"loss": 4.7506, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.568561872909699, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.8295165417926206e-06, |
|
"loss": 4.7598, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5852842809364549, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.8195987061859789e-06, |
|
"loss": 4.7302, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6020066889632107, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.8094289788687243e-06, |
|
"loss": 4.7313, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6187290969899666, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.799010485358249e-06, |
|
"loss": 4.7353, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6354515050167224, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.7883464276266062e-06, |
|
"loss": 4.7363, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.777440083116432e-06, |
|
"loss": 4.7022, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6688963210702341, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.766294803733671e-06, |
|
"loss": 4.6943, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.68561872909699, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.754914014817416e-06, |
|
"loss": 4.6801, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7023411371237458, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.743301214087181e-06, |
|
"loss": 4.6977, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7190635451505016, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.7314599705679277e-06, |
|
"loss": 4.6801, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7357859531772575, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.7193939234931776e-06, |
|
"loss": 4.6643, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7525083612040134, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.7071067811865474e-06, |
|
"loss": 4.6589, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.6946023199220484e-06, |
|
"loss": 4.6329, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7859531772575251, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.681884382763505e-06, |
|
"loss": 4.666, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.802675585284281, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.6689568783834447e-06, |
|
"loss": 4.6603, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8193979933110368, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.6558237798618241e-06, |
|
"loss": 4.6389, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8361204013377926, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.6424891234649616e-06, |
|
"loss": 4.6501, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8528428093645485, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.6289570074050492e-06, |
|
"loss": 4.6093, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.6152315905806267e-06, |
|
"loss": 4.6375, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8862876254180602, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.6013170912984058e-06, |
|
"loss": 4.6377, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.903010033444816, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.5872177859768332e-06, |
|
"loss": 4.6326, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.919732441471572, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.572938007831798e-06, |
|
"loss": 4.6, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9364548494983278, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.5584821455448788e-06, |
|
"loss": 4.6083, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9531772575250836, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.5438546419145485e-06, |
|
"loss": 4.6106, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9698996655518395, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.5290599924907432e-06, |
|
"loss": 4.6087, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9866220735785953, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.5141027441932214e-06, |
|
"loss": 4.597, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.0033444816053512, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.498987493914135e-06, |
|
"loss": 4.5658, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0033444816053512, |
|
"eval_loss": 4.585446834564209, |
|
"eval_runtime": 7.2206, |
|
"eval_samples_per_second": 80.188, |
|
"eval_steps_per_second": 2.631, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.020066889632107, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.4837188871052397e-06, |
|
"loss": 4.5897, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0367892976588629, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.4683016163501854e-06, |
|
"loss": 4.5885, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0535117056856187, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.452740419922317e-06, |
|
"loss": 4.6009, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0702341137123745, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.4370400803284373e-06, |
|
"loss": 4.5789, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.421205422838971e-06, |
|
"loss": 4.5435, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.1036789297658862, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.4052413140049897e-06, |
|
"loss": 4.566, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.120401337792642, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.389152660162549e-06, |
|
"loss": 4.5621, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.137123745819398, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.3729444059247953e-06, |
|
"loss": 4.5505, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.3566215326623129e-06, |
|
"loss": 4.5709, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1705685618729098, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3401890569721723e-06, |
|
"loss": 4.5569, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1872909698996654, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3236520291361515e-06, |
|
"loss": 4.5456, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.2040133779264215, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.3070155315686059e-06, |
|
"loss": 4.5543, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2207357859531773, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.2902846772544622e-06, |
|
"loss": 4.5408, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.2374581939799332, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.273464608177818e-06, |
|
"loss": 4.5435, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.254180602006689, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.2565604937416266e-06, |
|
"loss": 4.5436, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2709030100334449, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.2395775291789567e-06, |
|
"loss": 4.5448, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2876254180602007, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.2225209339563143e-06, |
|
"loss": 4.5398, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.2053959501695144e-06, |
|
"loss": 4.5383, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3210702341137124, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.1882078409326002e-06, |
|
"loss": 4.5556, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.3377926421404682, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.1709618887603011e-06, |
|
"loss": 4.5273, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.354515050167224, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.15366339394453e-06, |
|
"loss": 4.5262, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.37123745819398, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.1363176729254144e-06, |
|
"loss": 4.5391, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3879598662207357, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.118930056657367e-06, |
|
"loss": 4.5218, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.4046822742474916, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.1015058889706942e-06, |
|
"loss": 4.5255, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4214046822742474, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0840505249292475e-06, |
|
"loss": 4.5304, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.4381270903010033, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.0665693291846243e-06, |
|
"loss": 4.5418, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4548494983277591, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.0490676743274181e-06, |
|
"loss": 4.522, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.471571906354515, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.031550939236033e-06, |
|
"loss": 4.5346, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.488294314381271, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.0140245074235622e-06, |
|
"loss": 4.496, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5050167224080266, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 9.964937653832469e-07, |
|
"loss": 4.5212, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.78964100933011e-07, |
|
"loss": 4.5069, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.614409015595994e-07, |
|
"loss": 4.5124, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5551839464882944, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 9.43929552762808e-07, |
|
"loss": 4.5155, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.57190635451505, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.264354364003326e-07, |
|
"loss": 4.5143, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.588628762541806, |
|
"grad_norm": 2.25, |
|
"learning_rate": 9.089639290337298e-07, |
|
"loss": 4.4947, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.605351170568562, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 8.915204002760122e-07, |
|
"loss": 4.5113, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6220735785953178, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.741102111413748e-07, |
|
"loss": 4.5215, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6387959866220736, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.567387123975647e-07, |
|
"loss": 4.4991, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6555183946488294, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 8.394112429214029e-07, |
|
"loss": 4.5263, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6722408026755853, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.221331280579564e-07, |
|
"loss": 4.5039, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6889632107023411, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.049096779838717e-07, |
|
"loss": 4.5294, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.705685618729097, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 7.877461860753696e-07, |
|
"loss": 4.4868, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.7224080267558528, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.706479272814022e-07, |
|
"loss": 4.4988, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.536201565024767e-07, |
|
"loss": 4.4843, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7558528428093645, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 7.366681069756351e-07, |
|
"loss": 4.4878, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.7725752508361206, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.197969886660984e-07, |
|
"loss": 4.4925, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7892976588628762, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 7.030119866660565e-07, |
|
"loss": 4.5185, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.8060200668896322, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.863182596011085e-07, |
|
"loss": 4.4988, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.8227424749163879, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.697209380448332e-07, |
|
"loss": 4.4764, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.839464882943144, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.532251229419809e-07, |
|
"loss": 4.4987, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8561872909698995, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.368358840407752e-07, |
|
"loss": 4.5123, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.8729096989966556, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.205582583347973e-07, |
|
"loss": 4.5019, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8896321070234112, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.043972485149414e-07, |
|
"loss": 4.5041, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9063545150501673, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.88357821431908e-07, |
|
"loss": 4.485, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.724449065697181e-07, |
|
"loss": 4.4854, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.939799331103679, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.566633945307052e-07, |
|
"loss": 4.5039, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.410181355324621e-07, |
|
"loss": 4.507, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.9732441471571907, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.255139379171966e-07, |
|
"loss": 4.5087, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9899665551839465, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.101555666739563e-07, |
|
"loss": 4.5007, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.0066889632107023, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.949477419741814e-07, |
|
"loss": 4.4832, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0066889632107023, |
|
"eval_loss": 4.490821361541748, |
|
"eval_runtime": 13.8089, |
|
"eval_samples_per_second": 41.929, |
|
"eval_steps_per_second": 1.376, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0234113712374584, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.798951377210254e-07, |
|
"loss": 4.481, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.040133779264214, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.6500238011290293e-07, |
|
"loss": 4.5015, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.05685618729097, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.5027404622169185e-07, |
|
"loss": 4.4934, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.0735785953177257, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.3571466258603907e-07, |
|
"loss": 4.5015, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0903010033444818, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.2132870382019427e-07, |
|
"loss": 4.5043, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.1070234113712374, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.071205912388015e-07, |
|
"loss": 4.5132, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.1237458193979935, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.930946914980744e-07, |
|
"loss": 4.5032, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.140468227424749, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.7925531525376617e-07, |
|
"loss": 4.487, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.157190635451505, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.656067158363546e-07, |
|
"loss": 4.4899, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.521530879438407e-07, |
|
"loss": 4.4797, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.190635451505017, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.388985663525702e-07, |
|
"loss": 4.4826, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.2073578595317724, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.2584722464647165e-07, |
|
"loss": 4.4947, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.2240802675585285, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 3.1300307396509826e-07, |
|
"loss": 4.4985, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.240802675585284, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.0037006177086345e-07, |
|
"loss": 4.4972, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.25752508361204, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.879520706358446e-07, |
|
"loss": 4.4785, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.274247491638796, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.757529170485332e-07, |
|
"loss": 4.4973, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.290969899665552, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.6377635024089087e-07, |
|
"loss": 4.4967, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.5202605103607835e-07, |
|
"loss": 4.4944, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.3244147157190636, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.4050563071720864e-07, |
|
"loss": 4.5062, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.3411371237458196, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.2921862991747121e-07, |
|
"loss": 4.4884, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3578595317725752, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.181685175319702e-07, |
|
"loss": 4.4793, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.374581939799331, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.073586896516095e-07, |
|
"loss": 4.4759, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.391304347826087, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.9679246851935516e-07, |
|
"loss": 4.4862, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.408026755852843, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.864731015091908e-07, |
|
"loss": 4.4886, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.4247491638795986, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.7640376012808533e-07, |
|
"loss": 4.5015, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.4414715719063547, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.6658753904127732e-07, |
|
"loss": 4.4842, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.4581939799331103, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.570274551211732e-07, |
|
"loss": 4.5025, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.4749163879598663, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.477264465201572e-07, |
|
"loss": 4.492, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.491638795986622, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.3868737176759105e-07, |
|
"loss": 4.5102, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.508361204013378, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.2991300889128865e-07, |
|
"loss": 4.4899, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5250836120401337, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.2140605456372854e-07, |
|
"loss": 4.5076, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.5418060200668897, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.13169123273271e-07, |
|
"loss": 4.4919, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.5585284280936453, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.0520474652063394e-07, |
|
"loss": 4.4859, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.5752508361204014, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.751537204087257e-08, |
|
"loss": 4.4933, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.591973244147157, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.010336305110345e-08, |
|
"loss": 4.493, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.297099752420444e-08, |
|
"loss": 4.4685, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.625418060200669, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.612046748871326e-08, |
|
"loss": 4.4856, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.6421404682274248, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.955387835513893e-08, |
|
"loss": 4.4676, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.6588628762541804, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.327324826889469e-08, |
|
"loss": 4.4745, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.6755852842809364, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.728050749005098e-08, |
|
"loss": 4.4856, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5.1577497800097345e-08, |
|
"loss": 4.5096, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.709030100334448, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.616597193589833e-08, |
|
"loss": 4.4839, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.7257525083612038, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.1047593051015237e-08, |
|
"loss": 4.5145, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.74247491638796, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.622393420456016e-08, |
|
"loss": 4.4835, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.759197324414716, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.169647787773866e-08, |
|
"loss": 4.4983, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.7759197324414715, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.7466615518231483e-08, |
|
"loss": 4.4734, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.7926421404682276, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.353564711255329e-08, |
|
"loss": 4.4825, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.809364548494983, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.990478078652047e-08, |
|
"loss": 4.4676, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.8260869565217392, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.657513243395159e-08, |
|
"loss": 4.4944, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.842809364548495, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.3547725373713403e-08, |
|
"loss": 4.4819, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.859531772575251, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.0823490035218986e-08, |
|
"loss": 4.4775, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.8762541806020065, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.403263672473792e-09, |
|
"loss": 4.4999, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.8929765886287626, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 6.2877901067573955e-09, |
|
"loss": 4.4673, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.9096989966555182, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.477719498021781e-09, |
|
"loss": 4.4816, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.9264214046822743, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.973608145073081e-09, |
|
"loss": 4.4942, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.94314381270903, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.7759183146021094e-09, |
|
"loss": 4.5004, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.959866220735786, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.850180991131218e-10, |
|
"loss": 4.4763, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.976588628762542, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.0118130379575e-10, |
|
"loss": 4.4912, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.9933110367892977, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.45873623754278e-11, |
|
"loss": 4.473, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 897, |
|
"total_flos": 5.294195477904884e+18, |
|
"train_loss": 4.635029224514828, |
|
"train_runtime": 4299.9756, |
|
"train_samples_per_second": 13.31, |
|
"train_steps_per_second": 0.209 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 897, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.294195477904884e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|