{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2825,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0017699115044247787,
      "grad_norm": 5.395828211145053,
      "learning_rate": 7.06713780918728e-08,
      "loss": 0.8769,
      "step": 1
    },
    {
      "epoch": 0.008849557522123894,
      "grad_norm": 5.078223153245108,
      "learning_rate": 3.53356890459364e-07,
      "loss": 0.8957,
      "step": 5
    },
    {
      "epoch": 0.017699115044247787,
      "grad_norm": 4.3462255239787675,
      "learning_rate": 7.06713780918728e-07,
      "loss": 0.8679,
      "step": 10
    },
    {
      "epoch": 0.02654867256637168,
      "grad_norm": 2.7342805368137206,
      "learning_rate": 1.060070671378092e-06,
      "loss": 0.849,
      "step": 15
    },
    {
      "epoch": 0.035398230088495575,
      "grad_norm": 2.245119548796511,
      "learning_rate": 1.413427561837456e-06,
      "loss": 0.8632,
      "step": 20
    },
    {
      "epoch": 0.04424778761061947,
      "grad_norm": 1.9300825255667216,
      "learning_rate": 1.76678445229682e-06,
      "loss": 0.8287,
      "step": 25
    },
    {
      "epoch": 0.05309734513274336,
      "grad_norm": 2.0548676134373025,
      "learning_rate": 2.120141342756184e-06,
      "loss": 0.8216,
      "step": 30
    },
    {
      "epoch": 0.061946902654867256,
      "grad_norm": 2.107520072503226,
      "learning_rate": 2.473498233215548e-06,
      "loss": 0.8336,
      "step": 35
    },
    {
      "epoch": 0.07079646017699115,
      "grad_norm": 2.012421705138888,
      "learning_rate": 2.826855123674912e-06,
      "loss": 0.8106,
      "step": 40
    },
    {
      "epoch": 0.07964601769911504,
      "grad_norm": 2.0477120819165138,
      "learning_rate": 3.1802120141342757e-06,
      "loss": 0.8035,
      "step": 45
    },
    {
      "epoch": 0.08849557522123894,
      "grad_norm": 1.94045311291632,
      "learning_rate": 3.53356890459364e-06,
      "loss": 0.8212,
      "step": 50
    },
    {
      "epoch": 0.09734513274336283,
      "grad_norm": 1.9781807437344778,
      "learning_rate": 3.886925795053004e-06,
      "loss": 0.8167,
      "step": 55
    },
    {
      "epoch": 0.10619469026548672,
      "grad_norm": 2.1588067674308955,
      "learning_rate": 4.240282685512368e-06,
      "loss": 0.8159,
      "step": 60
    },
    {
      "epoch": 0.11504424778761062,
      "grad_norm": 2.0574710215669936,
      "learning_rate": 4.593639575971732e-06,
      "loss": 0.7809,
      "step": 65
    },
    {
      "epoch": 0.12389380530973451,
      "grad_norm": 2.0440859301429692,
      "learning_rate": 4.946996466431096e-06,
      "loss": 0.7735,
      "step": 70
    },
    {
      "epoch": 0.13274336283185842,
      "grad_norm": 2.1450366358677857,
      "learning_rate": 5.300353356890459e-06,
      "loss": 0.775,
      "step": 75
    },
    {
      "epoch": 0.1415929203539823,
      "grad_norm": 2.0239595981615444,
      "learning_rate": 5.653710247349824e-06,
      "loss": 0.7739,
      "step": 80
    },
    {
      "epoch": 0.1504424778761062,
      "grad_norm": 2.248989868043356,
      "learning_rate": 6.0070671378091885e-06,
      "loss": 0.7629,
      "step": 85
    },
    {
      "epoch": 0.1592920353982301,
      "grad_norm": 2.113620356465933,
      "learning_rate": 6.360424028268551e-06,
      "loss": 0.7617,
      "step": 90
    },
    {
      "epoch": 0.168141592920354,
      "grad_norm": 2.1063718938021987,
      "learning_rate": 6.713780918727916e-06,
      "loss": 0.7741,
      "step": 95
    },
    {
      "epoch": 0.17699115044247787,
      "grad_norm": 2.0994149998808074,
      "learning_rate": 7.06713780918728e-06,
      "loss": 0.7657,
      "step": 100
    },
    {
      "epoch": 0.18584070796460178,
      "grad_norm": 2.0741285650614785,
      "learning_rate": 7.420494699646644e-06,
      "loss": 0.7564,
      "step": 105
    },
    {
      "epoch": 0.19469026548672566,
      "grad_norm": 1.8812346063525685,
      "learning_rate": 7.773851590106007e-06,
      "loss": 0.764,
      "step": 110
    },
    {
      "epoch": 0.20353982300884957,
      "grad_norm": 1.6875596687855905,
      "learning_rate": 8.127208480565372e-06,
      "loss": 0.7395,
      "step": 115
    },
    {
      "epoch": 0.21238938053097345,
      "grad_norm": 1.7946446429435303,
      "learning_rate": 8.480565371024736e-06,
      "loss": 0.7507,
      "step": 120
    },
    {
      "epoch": 0.22123893805309736,
      "grad_norm": 1.5175436501935706,
      "learning_rate": 8.8339222614841e-06,
      "loss": 0.7197,
      "step": 125
    },
    {
      "epoch": 0.23008849557522124,
      "grad_norm": 1.5846428467972697,
      "learning_rate": 9.187279151943464e-06,
      "loss": 0.7422,
      "step": 130
    },
    {
      "epoch": 0.23893805309734514,
      "grad_norm": 1.9811600560411595,
      "learning_rate": 9.540636042402828e-06,
      "loss": 0.7305,
      "step": 135
    },
    {
      "epoch": 0.24778761061946902,
      "grad_norm": 2.0603096951143822,
      "learning_rate": 9.893992932862191e-06,
      "loss": 0.7266,
      "step": 140
    },
    {
      "epoch": 0.25663716814159293,
      "grad_norm": 1.79227194384631,
      "learning_rate": 1.0247349823321556e-05,
      "loss": 0.7273,
      "step": 145
    },
    {
      "epoch": 0.26548672566371684,
      "grad_norm": 1.935925432369111,
      "learning_rate": 1.0600706713780919e-05,
      "loss": 0.7337,
      "step": 150
    },
    {
      "epoch": 0.2743362831858407,
      "grad_norm": 1.6735245424795926,
      "learning_rate": 1.0954063604240283e-05,
      "loss": 0.726,
      "step": 155
    },
    {
      "epoch": 0.2831858407079646,
      "grad_norm": 1.6786739654407825,
      "learning_rate": 1.1307420494699648e-05,
      "loss": 0.7442,
      "step": 160
    },
    {
      "epoch": 0.2920353982300885,
      "grad_norm": 1.4647520145743216,
      "learning_rate": 1.1660777385159012e-05,
      "loss": 0.7104,
      "step": 165
    },
    {
      "epoch": 0.3008849557522124,
      "grad_norm": 1.6975078748459098,
      "learning_rate": 1.2014134275618377e-05,
      "loss": 0.7202,
      "step": 170
    },
    {
      "epoch": 0.30973451327433627,
      "grad_norm": 2.1939246563522925,
      "learning_rate": 1.2367491166077738e-05,
      "loss": 0.7135,
      "step": 175
    },
    {
      "epoch": 0.3185840707964602,
      "grad_norm": 1.7970007273551845,
      "learning_rate": 1.2720848056537103e-05,
      "loss": 0.7404,
      "step": 180
    },
    {
      "epoch": 0.3274336283185841,
      "grad_norm": 2.2094978373729,
      "learning_rate": 1.3074204946996467e-05,
      "loss": 0.7395,
      "step": 185
    },
    {
      "epoch": 0.336283185840708,
      "grad_norm": 1.6148149490160606,
      "learning_rate": 1.3427561837455832e-05,
      "loss": 0.7236,
      "step": 190
    },
    {
      "epoch": 0.34513274336283184,
      "grad_norm": 1.5716018292237066,
      "learning_rate": 1.3780918727915195e-05,
      "loss": 0.7176,
      "step": 195
    },
    {
      "epoch": 0.35398230088495575,
      "grad_norm": 1.56394291086092,
      "learning_rate": 1.413427561837456e-05,
      "loss": 0.7077,
      "step": 200
    },
    {
      "epoch": 0.36283185840707965,
      "grad_norm": 1.5944555462298586,
      "learning_rate": 1.4487632508833924e-05,
      "loss": 0.732,
      "step": 205
    },
    {
      "epoch": 0.37168141592920356,
      "grad_norm": 3.0644131086145716,
      "learning_rate": 1.4840989399293289e-05,
      "loss": 0.7226,
      "step": 210
    },
    {
      "epoch": 0.3805309734513274,
      "grad_norm": 48.89141497644312,
      "learning_rate": 1.519434628975265e-05,
      "loss": 0.9165,
      "step": 215
    },
    {
      "epoch": 0.3893805309734513,
      "grad_norm": 63.82971563129597,
      "learning_rate": 1.5547703180212014e-05,
      "loss": 2.8435,
      "step": 220
    },
    {
      "epoch": 0.39823008849557523,
      "grad_norm": 63.166528581176436,
      "learning_rate": 1.590106007067138e-05,
      "loss": 1.8489,
      "step": 225
    },
    {
      "epoch": 0.40707964601769914,
      "grad_norm": 11.532232434183182,
      "learning_rate": 1.6254416961130744e-05,
      "loss": 1.1145,
      "step": 230
    },
    {
      "epoch": 0.415929203539823,
      "grad_norm": 8.320428887121134,
      "learning_rate": 1.6607773851590106e-05,
      "loss": 1.0432,
      "step": 235
    },
    {
      "epoch": 0.4247787610619469,
      "grad_norm": 5.49695556302703,
      "learning_rate": 1.6961130742049473e-05,
      "loss": 0.9249,
      "step": 240
    },
    {
      "epoch": 0.4336283185840708,
      "grad_norm": 3.8086780643307954,
      "learning_rate": 1.7314487632508836e-05,
      "loss": 0.901,
      "step": 245
    },
    {
      "epoch": 0.4424778761061947,
      "grad_norm": 3.0530888165680254,
      "learning_rate": 1.76678445229682e-05,
      "loss": 0.8318,
      "step": 250
    },
    {
      "epoch": 0.45132743362831856,
      "grad_norm": 2.051374080224868,
      "learning_rate": 1.802120141342756e-05,
      "loss": 0.8123,
      "step": 255
    },
    {
      "epoch": 0.46017699115044247,
      "grad_norm": 1.9860592805013697,
      "learning_rate": 1.8374558303886928e-05,
      "loss": 0.8145,
      "step": 260
    },
    {
      "epoch": 0.4690265486725664,
      "grad_norm": 1.9585131132246008,
      "learning_rate": 1.872791519434629e-05,
      "loss": 0.8015,
      "step": 265
    },
    {
      "epoch": 0.4778761061946903,
      "grad_norm": 2.0807728265740937,
      "learning_rate": 1.9081272084805657e-05,
      "loss": 0.7572,
      "step": 270
    },
    {
      "epoch": 0.48672566371681414,
      "grad_norm": 8.726925488702674,
      "learning_rate": 1.943462897526502e-05,
      "loss": 0.7968,
      "step": 275
    },
    {
      "epoch": 0.49557522123893805,
      "grad_norm": 3.0506762781221926,
      "learning_rate": 1.9787985865724383e-05,
      "loss": 0.7859,
      "step": 280
    },
    {
      "epoch": 0.504424778761062,
      "grad_norm": 20.477250938697214,
      "learning_rate": 1.999996945230629e-05,
      "loss": 0.7726,
      "step": 285
    },
    {
      "epoch": 0.5132743362831859,
      "grad_norm": 18.602610432760407,
      "learning_rate": 1.9999625792895357e-05,
      "loss": 0.7615,
      "step": 290
    },
    {
      "epoch": 0.5221238938053098,
      "grad_norm": 1.9360445735897074,
      "learning_rate": 1.9998900302622567e-05,
      "loss": 0.7346,
      "step": 295
    },
    {
      "epoch": 0.5309734513274337,
      "grad_norm": 1.9907733223798811,
      "learning_rate": 1.9997793009190403e-05,
      "loss": 0.7257,
      "step": 300
    },
    {
      "epoch": 0.5398230088495575,
      "grad_norm": 2.649430391779402,
      "learning_rate": 1.999630395488034e-05,
      "loss": 0.7398,
      "step": 305
    },
    {
      "epoch": 0.5486725663716814,
      "grad_norm": 2.295494291640341,
      "learning_rate": 1.9994433196551183e-05,
      "loss": 0.7404,
      "step": 310
    },
    {
      "epoch": 0.5575221238938053,
      "grad_norm": 1.6045394414529988,
      "learning_rate": 1.9992180805636936e-05,
      "loss": 0.7157,
      "step": 315
    },
    {
      "epoch": 0.5663716814159292,
      "grad_norm": 1.7545617453249185,
      "learning_rate": 1.998954686814406e-05,
      "loss": 0.722,
      "step": 320
    },
    {
      "epoch": 0.5752212389380531,
      "grad_norm": 2.20743008915746,
      "learning_rate": 1.998653148464817e-05,
      "loss": 0.7133,
      "step": 325
    },
    {
      "epoch": 0.584070796460177,
      "grad_norm": 1.8768620308499198,
      "learning_rate": 1.9983134770290232e-05,
      "loss": 0.7247,
      "step": 330
    },
    {
      "epoch": 0.5929203539823009,
      "grad_norm": 2.4610134324640396,
      "learning_rate": 1.9979356854772128e-05,
      "loss": 0.6939,
      "step": 335
    },
    {
      "epoch": 0.6017699115044248,
      "grad_norm": 1.5013573865061423,
      "learning_rate": 1.997519788235174e-05,
      "loss": 0.7184,
      "step": 340
    },
    {
      "epoch": 0.6106194690265486,
      "grad_norm": 1.607524878532932,
      "learning_rate": 1.9970658011837404e-05,
      "loss": 0.7206,
      "step": 345
    },
    {
      "epoch": 0.6194690265486725,
      "grad_norm": 1.57447888430762,
      "learning_rate": 1.996573741658188e-05,
      "loss": 0.7082,
      "step": 350
    },
    {
      "epoch": 0.6283185840707964,
      "grad_norm": 1.5581830775430778,
      "learning_rate": 1.9960436284475712e-05,
      "loss": 0.6727,
      "step": 355
    },
    {
      "epoch": 0.6371681415929203,
      "grad_norm": 1.5066025396716325,
      "learning_rate": 1.9954754817940054e-05,
      "loss": 0.708,
      "step": 360
    },
    {
      "epoch": 0.6460176991150443,
      "grad_norm": 1.4817421328262548,
      "learning_rate": 1.994869323391895e-05,
      "loss": 0.6863,
      "step": 365
    },
    {
      "epoch": 0.6548672566371682,
      "grad_norm": 1.4055734979132297,
      "learning_rate": 1.9942251763871056e-05,
      "loss": 0.7108,
      "step": 370
    },
    {
      "epoch": 0.6637168141592921,
      "grad_norm": 1.391086495332054,
      "learning_rate": 1.9935430653760772e-05,
      "loss": 0.6608,
      "step": 375
    },
    {
      "epoch": 0.672566371681416,
      "grad_norm": 1.5585496911243557,
      "learning_rate": 1.9928230164048885e-05,
      "loss": 0.6968,
      "step": 380
    },
    {
      "epoch": 0.6814159292035398,
      "grad_norm": 1.5660850244776638,
      "learning_rate": 1.99206505696826e-05,
      "loss": 0.6892,
      "step": 385
    },
    {
      "epoch": 0.6902654867256637,
      "grad_norm": 1.6030826428695462,
      "learning_rate": 1.9912692160085054e-05,
      "loss": 0.707,
      "step": 390
    },
    {
      "epoch": 0.6991150442477876,
      "grad_norm": 1.4696166964471262,
      "learning_rate": 1.990435523914426e-05,
      "loss": 0.6773,
      "step": 395
    },
    {
      "epoch": 0.7079646017699115,
      "grad_norm": 1.4674837525767894,
      "learning_rate": 1.9895640125201498e-05,
      "loss": 0.7007,
      "step": 400
    },
    {
      "epoch": 0.7168141592920354,
      "grad_norm": 1.6349741965597524,
      "learning_rate": 1.988654715103917e-05,
      "loss": 0.6884,
      "step": 405
    },
    {
      "epoch": 0.7256637168141593,
      "grad_norm": 1.6317810785917048,
      "learning_rate": 1.9877076663868084e-05,
      "loss": 0.6761,
      "step": 410
    },
    {
      "epoch": 0.7345132743362832,
      "grad_norm": 1.7519745537508327,
      "learning_rate": 1.9867229025314204e-05,
      "loss": 0.6843,
      "step": 415
    },
    {
      "epoch": 0.7433628318584071,
      "grad_norm": 1.707801199033859,
      "learning_rate": 1.9857004611404825e-05,
      "loss": 0.6735,
      "step": 420
    },
    {
      "epoch": 0.7522123893805309,
      "grad_norm": 1.687330084394135,
      "learning_rate": 1.984640381255424e-05,
      "loss": 0.6661,
      "step": 425
    },
    {
      "epoch": 0.7610619469026548,
      "grad_norm": 1.7043833380846305,
      "learning_rate": 1.9835427033548807e-05,
      "loss": 0.6794,
      "step": 430
    },
    {
      "epoch": 0.7699115044247787,
      "grad_norm": 1.5675248939760222,
      "learning_rate": 1.982407469353152e-05,
      "loss": 0.6864,
      "step": 435
    },
    {
      "epoch": 0.7787610619469026,
      "grad_norm": 1.4561869480484766,
      "learning_rate": 1.9812347225985966e-05,
      "loss": 0.657,
      "step": 440
    },
    {
      "epoch": 0.7876106194690266,
      "grad_norm": 1.3651968406362616,
      "learning_rate": 1.9800245078719814e-05,
      "loss": 0.6725,
      "step": 445
    },
    {
      "epoch": 0.7964601769911505,
      "grad_norm": 1.7068501163645904,
      "learning_rate": 1.9787768713847685e-05,
      "loss": 0.6907,
      "step": 450
    },
    {
      "epoch": 0.8053097345132744,
      "grad_norm": 1.8186666681748165,
      "learning_rate": 1.9774918607773524e-05,
      "loss": 0.6666,
      "step": 455
    },
    {
      "epoch": 0.8141592920353983,
      "grad_norm": 52.762622436160925,
      "learning_rate": 1.9761695251172398e-05,
      "loss": 0.6903,
      "step": 460
    },
    {
      "epoch": 0.8230088495575221,
      "grad_norm": 2.785296160404902,
      "learning_rate": 1.9748099148971766e-05,
      "loss": 0.682,
      "step": 465
    },
    {
      "epoch": 0.831858407079646,
      "grad_norm": 1.8158247940222614,
      "learning_rate": 1.97341308203322e-05,
      "loss": 0.6654,
      "step": 470
    },
    {
      "epoch": 0.8407079646017699,
      "grad_norm": 3.2136675982919427,
      "learning_rate": 1.9719790798627555e-05,
      "loss": 0.6875,
      "step": 475
    },
    {
      "epoch": 0.8495575221238938,
      "grad_norm": 1.9905381473001953,
      "learning_rate": 1.9705079631424605e-05,
      "loss": 0.6785,
      "step": 480
    },
    {
      "epoch": 0.8584070796460177,
      "grad_norm": 2.788997906081037,
      "learning_rate": 1.9689997880462134e-05,
      "loss": 0.6614,
      "step": 485
    },
    {
      "epoch": 0.8672566371681416,
      "grad_norm": 1.947320603121196,
      "learning_rate": 1.9674546121629495e-05,
      "loss": 0.6612,
      "step": 490
    },
    {
      "epoch": 0.8761061946902655,
      "grad_norm": 16.64653233306791,
      "learning_rate": 1.9658724944944597e-05,
      "loss": 0.6755,
      "step": 495
    },
    {
      "epoch": 0.8849557522123894,
      "grad_norm": 2.445435643330624,
      "learning_rate": 1.964253495453141e-05,
      "loss": 0.6489,
      "step": 500
    },
    {
      "epoch": 0.8938053097345132,
      "grad_norm": 1.8384904388034906,
      "learning_rate": 1.9625976768596862e-05,
      "loss": 0.6832,
      "step": 505
    },
    {
      "epoch": 0.9026548672566371,
      "grad_norm": 1.7969557638918616,
      "learning_rate": 1.9609051019407254e-05,
      "loss": 0.6624,
      "step": 510
    },
    {
      "epoch": 0.911504424778761,
      "grad_norm": 1.7456317363852023,
      "learning_rate": 1.9591758353264106e-05,
      "loss": 0.6573,
      "step": 515
    },
    {
      "epoch": 0.9203539823008849,
      "grad_norm": 1.4813878883475884,
      "learning_rate": 1.9574099430479498e-05,
      "loss": 0.659,
      "step": 520
    },
    {
      "epoch": 0.9292035398230089,
      "grad_norm": 4.494324308598284,
      "learning_rate": 1.9556074925350826e-05,
      "loss": 0.6811,
      "step": 525
    },
    {
      "epoch": 0.9380530973451328,
      "grad_norm": 7.919549938124173,
      "learning_rate": 1.9537685526135088e-05,
      "loss": 0.6812,
      "step": 530
    },
    {
      "epoch": 0.9469026548672567,
      "grad_norm": 15.115841458620135,
      "learning_rate": 1.951893193502256e-05,
      "loss": 0.6774,
      "step": 535
    },
    {
      "epoch": 0.9557522123893806,
      "grad_norm": 5.495425735869417,
      "learning_rate": 1.9499814868110035e-05,
      "loss": 0.6889,
      "step": 540
    },
    {
      "epoch": 0.9646017699115044,
      "grad_norm": 12.34480108912751,
      "learning_rate": 1.9480335055373444e-05,
      "loss": 0.689,
      "step": 545
    },
    {
      "epoch": 0.9734513274336283,
      "grad_norm": 15.091035487093771,
      "learning_rate": 1.9460493240639985e-05,
      "loss": 0.6907,
      "step": 550
    },
    {
      "epoch": 0.9823008849557522,
      "grad_norm": 2.6410011813854815,
      "learning_rate": 1.9440290181559737e-05,
      "loss": 0.6728,
      "step": 555
    },
    {
      "epoch": 0.9911504424778761,
      "grad_norm": 3.807048913327981,
      "learning_rate": 1.9419726649576707e-05,
      "loss": 0.6699,
      "step": 560
    },
    {
      "epoch": 1.0,
      "grad_norm": 11.71551051866065,
      "learning_rate": 1.93988034298994e-05,
      "loss": 0.6823,
      "step": 565
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6282660961151123,
      "eval_runtime": 346.0891,
      "eval_samples_per_second": 21.731,
      "eval_steps_per_second": 0.341,
      "step": 565
    },
    {
      "epoch": 1.008849557522124,
      "grad_norm": 5.445947511608255,
      "learning_rate": 1.9377521321470806e-05,
      "loss": 0.5764,
      "step": 570
    },
    {
      "epoch": 1.0176991150442478,
      "grad_norm": 5.441443896405208,
      "learning_rate": 1.935588113693792e-05,
      "loss": 0.5922,
      "step": 575
    },
    {
      "epoch": 1.0265486725663717,
      "grad_norm": 4.764716514583245,
      "learning_rate": 1.9333883702620692e-05,
      "loss": 0.5688,
      "step": 580
    },
    {
      "epoch": 1.0353982300884956,
      "grad_norm": 6.234212565235311,
      "learning_rate": 1.9311529858480488e-05,
      "loss": 0.5627,
      "step": 585
    },
    {
      "epoch": 1.0442477876106195,
      "grad_norm": 3.275772180265634,
      "learning_rate": 1.9288820458088004e-05,
      "loss": 0.5405,
      "step": 590
    },
    {
      "epoch": 1.0530973451327434,
      "grad_norm": 2.4494316038791752,
      "learning_rate": 1.926575636859068e-05,
      "loss": 0.5353,
      "step": 595
    },
    {
      "epoch": 1.0619469026548674,
      "grad_norm": 1.964088050973173,
      "learning_rate": 1.924233847067959e-05,
      "loss": 0.5409,
      "step": 600
    },
    {
      "epoch": 1.0707964601769913,
      "grad_norm": 1.7637636165900412,
      "learning_rate": 1.9218567658555813e-05,
      "loss": 0.5477,
      "step": 605
    },
    {
      "epoch": 1.079646017699115,
      "grad_norm": 2.3214954304344433,
      "learning_rate": 1.919444483989628e-05,
      "loss": 0.5519,
      "step": 610
    },
    {
      "epoch": 1.0884955752212389,
      "grad_norm": 2.0536428402479134,
      "learning_rate": 1.9169970935819123e-05,
      "loss": 0.5218,
      "step": 615
    },
    {
      "epoch": 1.0973451327433628,
      "grad_norm": 1.670923852239301,
      "learning_rate": 1.9145146880848505e-05,
      "loss": 0.531,
      "step": 620
    },
    {
      "epoch": 1.1061946902654867,
      "grad_norm": 1.9728407984682095,
      "learning_rate": 1.9119973622878928e-05,
      "loss": 0.5402,
      "step": 625
    },
    {
      "epoch": 1.1150442477876106,
      "grad_norm": 8.142996728944224,
      "learning_rate": 1.9094452123139034e-05,
      "loss": 0.5656,
      "step": 630
    },
    {
      "epoch": 1.1238938053097345,
      "grad_norm": 1.890805938276718,
      "learning_rate": 1.9068583356154917e-05,
      "loss": 0.539,
      "step": 635
    },
    {
      "epoch": 1.1327433628318584,
      "grad_norm": 2.1078644912267968,
      "learning_rate": 1.9042368309712906e-05,
      "loss": 0.5461,
      "step": 640
    },
    {
      "epoch": 1.1415929203539823,
      "grad_norm": 1.8971067802854567,
      "learning_rate": 1.9015807984821827e-05,
      "loss": 0.5494,
      "step": 645
    },
    {
      "epoch": 1.1504424778761062,
      "grad_norm": 27.881164380475635,
      "learning_rate": 1.8988903395674814e-05,
      "loss": 0.535,
      "step": 650
    },
    {
      "epoch": 1.1592920353982301,
      "grad_norm": 3.4980726529367576,
      "learning_rate": 1.8961655569610557e-05,
      "loss": 0.531,
      "step": 655
    },
    {
      "epoch": 1.168141592920354,
      "grad_norm": 2.2255039588879066,
      "learning_rate": 1.8934065547074077e-05,
      "loss": 0.5369,
      "step": 660
    },
    {
      "epoch": 1.176991150442478,
      "grad_norm": 1.7665125865767044,
      "learning_rate": 1.8906134381577008e-05,
      "loss": 0.5231,
      "step": 665
    },
    {
      "epoch": 1.1858407079646018,
      "grad_norm": 1.7187220901255802,
      "learning_rate": 1.887786313965736e-05,
      "loss": 0.5205,
      "step": 670
    },
    {
      "epoch": 1.1946902654867257,
      "grad_norm": 1.793384142575199,
      "learning_rate": 1.8849252900838795e-05,
      "loss": 0.5307,
      "step": 675
    },
    {
      "epoch": 1.2035398230088497,
      "grad_norm": 1.789651406311183,
      "learning_rate": 1.8820304757589406e-05,
      "loss": 0.5259,
      "step": 680
    },
    {
      "epoch": 1.2123893805309733,
      "grad_norm": 1.9237494570492129,
      "learning_rate": 1.8791019815280015e-05,
      "loss": 0.5262,
      "step": 685
    },
    {
      "epoch": 1.2212389380530975,
      "grad_norm": 1.6656226502649325,
      "learning_rate": 1.8761399192141933e-05,
      "loss": 0.5681,
      "step": 690
    },
    {
      "epoch": 1.2300884955752212,
      "grad_norm": 1.8325987162210478,
      "learning_rate": 1.8731444019224296e-05,
      "loss": 0.5373,
      "step": 695
    },
    {
      "epoch": 1.238938053097345,
      "grad_norm": 1.772645490498923,
      "learning_rate": 1.8701155440350854e-05,
      "loss": 0.5274,
      "step": 700
    },
    {
      "epoch": 1.247787610619469,
      "grad_norm": 1.7076814876614839,
      "learning_rate": 1.8670534612076304e-05,
      "loss": 0.5345,
      "step": 705
    },
    {
      "epoch": 1.2566371681415929,
      "grad_norm": 2.8366257516212925,
      "learning_rate": 1.863958270364213e-05,
      "loss": 0.5448,
      "step": 710
    },
    {
      "epoch": 1.2654867256637168,
      "grad_norm": 1.6438063307669566,
      "learning_rate": 1.8608300896931935e-05,
      "loss": 0.5345,
      "step": 715
    },
    {
      "epoch": 1.2743362831858407,
      "grad_norm": 7.910992161903767,
      "learning_rate": 1.857669038642635e-05,
      "loss": 0.5771,
      "step": 720
    },
    {
      "epoch": 1.2831858407079646,
      "grad_norm": 5.320203779573794,
      "learning_rate": 1.8544752379157383e-05,
      "loss": 0.5889,
      "step": 725
    },
    {
      "epoch": 1.2920353982300885,
      "grad_norm": 4.200987040185538,
      "learning_rate": 1.851248809466236e-05,
      "loss": 0.5572,
      "step": 730
    },
    {
      "epoch": 1.3008849557522124,
      "grad_norm": 3.6844001075774564,
      "learning_rate": 1.847989876493733e-05,
      "loss": 0.5729,
      "step": 735
    },
    {
      "epoch": 1.3097345132743363,
      "grad_norm": 2.6959513168479003,
      "learning_rate": 1.8446985634390056e-05,
      "loss": 0.5438,
      "step": 740
    },
    {
      "epoch": 1.3185840707964602,
      "grad_norm": 2.0110192321282074,
      "learning_rate": 1.841374995979246e-05,
      "loss": 0.5346,
      "step": 745
    },
    {
      "epoch": 1.3274336283185841,
      "grad_norm": 2.4149221674637142,
      "learning_rate": 1.8380193010232664e-05,
      "loss": 0.5443,
      "step": 750
    },
    {
      "epoch": 1.336283185840708,
      "grad_norm": 3.216072321253876,
      "learning_rate": 1.834631606706651e-05,
      "loss": 0.5388,
      "step": 755
    },
    {
      "epoch": 1.3451327433628317,
      "grad_norm": 1.7562091306971943,
      "learning_rate": 1.831212042386865e-05,
      "loss": 0.5332,
      "step": 760
    },
    {
      "epoch": 1.3539823008849559,
      "grad_norm": 1.7681274800133804,
      "learning_rate": 1.8277607386383134e-05,
      "loss": 0.5531,
      "step": 765
    },
    {
      "epoch": 1.3628318584070795,
      "grad_norm": 1.6184444020633955,
      "learning_rate": 1.8242778272473566e-05,
      "loss": 0.5288,
      "step": 770
    },
    {
      "epoch": 1.3716814159292037,
      "grad_norm": 1.8584096368775243,
      "learning_rate": 1.8207634412072765e-05,
      "loss": 0.5134,
      "step": 775
    },
    {
      "epoch": 1.3805309734513274,
      "grad_norm": 2.3984599706556504,
      "learning_rate": 1.8172177147132e-05,
      "loss": 0.5293,
      "step": 780
    },
    {
      "epoch": 1.3893805309734513,
      "grad_norm": 1.6184621514028006,
      "learning_rate": 1.8136407831569748e-05,
      "loss": 0.5332,
      "step": 785
    },
    {
      "epoch": 1.3982300884955752,
      "grad_norm": 1.659497791050273,
      "learning_rate": 1.8100327831219968e-05,
      "loss": 0.5499,
      "step": 790
    },
    {
      "epoch": 1.407079646017699,
      "grad_norm": 2.080085747337152,
      "learning_rate": 1.806393852377998e-05,
      "loss": 0.5373,
      "step": 795
    },
    {
      "epoch": 1.415929203539823,
      "grad_norm": 1.7412321009147458,
      "learning_rate": 1.802724129875784e-05,
      "loss": 0.5237,
      "step": 800
    },
    {
      "epoch": 1.424778761061947,
      "grad_norm": 1.5117367250487712,
      "learning_rate": 1.7990237557419298e-05,
      "loss": 0.5212,
      "step": 805
    },
    {
      "epoch": 1.4336283185840708,
      "grad_norm": 1.5677828815765256,
      "learning_rate": 1.7952928712734266e-05,
      "loss": 0.5293,
      "step": 810
    },
    {
      "epoch": 1.4424778761061947,
      "grad_norm": 1.6191863845989973,
      "learning_rate": 1.791531618932289e-05,
      "loss": 0.5108,
      "step": 815
    },
    {
      "epoch": 1.4513274336283186,
      "grad_norm": 1.6356183497375685,
      "learning_rate": 1.7877401423401134e-05,
      "loss": 0.535,
      "step": 820
    },
    {
      "epoch": 1.4601769911504425,
      "grad_norm": 2.120698903964094,
      "learning_rate": 1.7839185862725953e-05,
      "loss": 0.5276,
      "step": 825
    },
    {
      "epoch": 1.4690265486725664,
      "grad_norm": 1.7162725245546222,
      "learning_rate": 1.7800670966539997e-05,
      "loss": 0.5157,
      "step": 830
    },
    {
      "epoch": 1.4778761061946903,
      "grad_norm": 1.8342947704435748,
      "learning_rate": 1.7761858205515904e-05,
      "loss": 0.503,
      "step": 835
    },
    {
      "epoch": 1.4867256637168142,
      "grad_norm": 1.6080172248542548,
      "learning_rate": 1.7722749061700122e-05,
      "loss": 0.5164,
      "step": 840
    },
    {
      "epoch": 1.495575221238938,
      "grad_norm": 1.7310863566366472,
      "learning_rate": 1.7683345028456357e-05,
      "loss": 0.5144,
      "step": 845
    },
    {
      "epoch": 1.504424778761062,
      "grad_norm": 1.5259860237803888,
      "learning_rate": 1.7643647610408507e-05,
      "loss": 0.5144,
      "step": 850
    },
    {
      "epoch": 1.5132743362831858,
      "grad_norm": 1.949693136998924,
      "learning_rate": 1.760365832338322e-05,
      "loss": 0.5208,
      "step": 855
    },
    {
      "epoch": 1.5221238938053099,
      "grad_norm": 1.4606753283332923,
      "learning_rate": 1.7563378694352038e-05,
      "loss": 0.514,
      "step": 860
    },
    {
      "epoch": 1.5309734513274336,
      "grad_norm": 1.500515219738256,
      "learning_rate": 1.752281026137306e-05,
      "loss": 0.5105,
      "step": 865
    },
    {
      "epoch": 1.5398230088495575,
      "grad_norm": 1.6809093610034818,
      "learning_rate": 1.7481954573532233e-05,
      "loss": 0.5246,
      "step": 870
    },
    {
      "epoch": 1.5486725663716814,
      "grad_norm": 1.8505913851080076,
      "learning_rate": 1.7440813190884177e-05,
      "loss": 0.5263,
      "step": 875
    },
    {
      "epoch": 1.5575221238938053,
      "grad_norm": 1.5042921112971175,
      "learning_rate": 1.7399387684392643e-05,
      "loss": 0.5078,
      "step": 880
    },
    {
      "epoch": 1.5663716814159292,
      "grad_norm": 1.7603274810661258,
      "learning_rate": 1.7357679635870504e-05,
      "loss": 0.5152,
      "step": 885
    },
    {
      "epoch": 1.575221238938053,
      "grad_norm": 1.9907861198097643,
      "learning_rate": 1.731569063791937e-05,
      "loss": 0.517,
      "step": 890
    },
    {
      "epoch": 1.584070796460177,
      "grad_norm": 2.287444737461443,
      "learning_rate": 1.727342229386877e-05,
      "loss": 0.5118,
      "step": 895
    },
    {
      "epoch": 1.592920353982301,
      "grad_norm": 1.7114791811335306,
      "learning_rate": 1.723087621771492e-05,
      "loss": 0.512,
      "step": 900
    },
    {
      "epoch": 1.6017699115044248,
      "grad_norm": 1.6387744190074889,
      "learning_rate": 1.718805403405911e-05,
      "loss": 0.5151,
      "step": 905
    },
    {
      "epoch": 1.6106194690265485,
      "grad_norm": 1.9126014391813266,
      "learning_rate": 1.7144957378045656e-05,
      "loss": 0.5072,
      "step": 910
    },
    {
      "epoch": 1.6194690265486726,
      "grad_norm": 1.5534492075843847,
      "learning_rate": 1.7101587895299463e-05,
      "loss": 0.5139,
      "step": 915
    },
    {
      "epoch": 1.6283185840707963,
      "grad_norm": 3.324022735746321,
      "learning_rate": 1.7057947241863207e-05,
      "loss": 0.486,
      "step": 920
    },
    {
      "epoch": 1.6371681415929205,
      "grad_norm": 2.5161147424579413,
      "learning_rate": 1.7014037084134076e-05,
      "loss": 0.5127,
      "step": 925
    },
    {
      "epoch": 1.6460176991150441,
      "grad_norm": 2.6183686325273814,
      "learning_rate": 1.696985909880015e-05,
      "loss": 0.5103,
      "step": 930
    },
    {
      "epoch": 1.6548672566371683,
      "grad_norm": 1.8311730816272584,
      "learning_rate": 1.692541497277637e-05,
      "loss": 0.51,
      "step": 935
    },
    {
      "epoch": 1.663716814159292,
      "grad_norm": 2.077861904241967,
      "learning_rate": 1.6880706403140146e-05,
      "loss": 0.5082,
      "step": 940
    },
    {
      "epoch": 1.672566371681416,
      "grad_norm": 2.0643493814936282,
      "learning_rate": 1.6835735097066524e-05,
      "loss": 0.5199,
      "step": 945
    },
    {
      "epoch": 1.6814159292035398,
      "grad_norm": 1.5582651497341313,
      "learning_rate": 1.6790502771763018e-05,
      "loss": 0.5014,
      "step": 950
    },
    {
      "epoch": 1.6902654867256637,
      "grad_norm": 1.6655179770008597,
      "learning_rate": 1.6745011154404037e-05,
      "loss": 0.4854,
      "step": 955
    },
    {
      "epoch": 1.6991150442477876,
      "grad_norm": 1.4915522872459333,
      "learning_rate": 1.669926198206493e-05,
      "loss": 0.5132,
      "step": 960
    },
    {
      "epoch": 1.7079646017699115,
      "grad_norm": 1.6252968757056179,
      "learning_rate": 1.6653257001655652e-05,
      "loss": 0.5016,
      "step": 965
    },
    {
      "epoch": 1.7168141592920354,
      "grad_norm": 2.4097499030817096,
      "learning_rate": 1.6606997969854087e-05,
      "loss": 0.5227,
      "step": 970
    },
    {
      "epoch": 1.7256637168141593,
      "grad_norm": 1.631370053795557,
      "learning_rate": 1.6560486653038916e-05,
      "loss": 0.5119,
      "step": 975
    },
    {
      "epoch": 1.7345132743362832,
      "grad_norm": 2.442759512892503,
      "learning_rate": 1.6513724827222225e-05,
      "loss": 0.4912,
      "step": 980
    },
    {
      "epoch": 1.7433628318584071,
      "grad_norm": 12.222832820637523,
      "learning_rate": 1.6466714277981656e-05,
      "loss": 0.5224,
      "step": 985
    },
    {
      "epoch": 1.752212389380531,
      "grad_norm": 1.699557253240237,
      "learning_rate": 1.641945680039223e-05,
      "loss": 0.52,
      "step": 990
    },
    {
      "epoch": 1.7610619469026547,
      "grad_norm": 1.6614165390235756,
      "learning_rate": 1.6371954198957823e-05,
      "loss": 0.5118,
      "step": 995
    },
    {
      "epoch": 1.7699115044247788,
      "grad_norm": 5.006976593399911,
      "learning_rate": 1.6324208287542228e-05,
      "loss": 0.4785,
      "step": 1000
    },
    {
      "epoch": 1.7787610619469025,
      "grad_norm": 2.501687669574672,
      "learning_rate": 1.6276220889299918e-05,
      "loss": 0.494,
      "step": 1005
    },
    {
      "epoch": 1.7876106194690267,
      "grad_norm": 1.593500017019138,
      "learning_rate": 1.622799383660643e-05,
      "loss": 0.5184,
      "step": 1010
    },
    {
      "epoch": 1.7964601769911503,
      "grad_norm": 1.8062411504712435,
      "learning_rate": 1.617952897098839e-05,
      "loss": 0.4905,
      "step": 1015
    },
    {
      "epoch": 1.8053097345132745,
      "grad_norm": 1.7405963191869815,
      "learning_rate": 1.6130828143053173e-05,
      "loss": 0.4826,
      "step": 1020
    },
    {
      "epoch": 1.8141592920353982,
      "grad_norm": 2.046715193192915,
      "learning_rate": 1.6081893212418292e-05,
      "loss": 0.4923,
      "step": 1025
    },
    {
      "epoch": 1.823008849557522,
      "grad_norm": 2.016184225830357,
      "learning_rate": 1.6032726047640336e-05,
      "loss": 0.5014,
      "step": 1030
    },
    {
      "epoch": 1.831858407079646,
      "grad_norm": 35.09896301727333,
      "learning_rate": 1.5983328526143653e-05,
      "loss": 0.4711,
      "step": 1035
    },
    {
      "epoch": 1.8407079646017699,
      "grad_norm": 2.493655284729721,
      "learning_rate": 1.5933702534148648e-05,
      "loss": 0.5138,
      "step": 1040
    },
    {
      "epoch": 1.8495575221238938,
      "grad_norm": 5.918180177995943,
      "learning_rate": 1.588384996659976e-05,
      "loss": 0.5389,
      "step": 1045
    },
    {
      "epoch": 1.8584070796460177,
      "grad_norm": 20.200037687607217,
      "learning_rate": 1.583377272709311e-05,
      "loss": 0.5038,
      "step": 1050
    },
    {
      "epoch": 1.8672566371681416,
      "grad_norm": 11.981522736747214,
      "learning_rate": 1.5783472727803796e-05,
      "loss": 0.5098,
      "step": 1055
    },
    {
      "epoch": 1.8761061946902655,
      "grad_norm": 2.493450240622104,
      "learning_rate": 1.5732951889412905e-05,
      "loss": 0.5068,
      "step": 1060
    },
    {
      "epoch": 1.8849557522123894,
      "grad_norm": 84.35037035076192,
      "learning_rate": 1.5682212141034153e-05,
      "loss": 0.5365,
      "step": 1065
    },
    {
      "epoch": 1.893805309734513,
      "grad_norm": 5.882151543248406,
      "learning_rate": 1.5631255420140225e-05,
      "loss": 0.5275,
      "step": 1070
    },
    {
      "epoch": 1.9026548672566372,
      "grad_norm": 4.244653752423409,
      "learning_rate": 1.55800836724888e-05,
      "loss": 0.5221,
      "step": 1075
    },
    {
      "epoch": 1.911504424778761,
      "grad_norm": 2.318077082885742,
      "learning_rate": 1.5528698852048247e-05,
      "loss": 0.5034,
      "step": 1080
    },
    {
      "epoch": 1.920353982300885,
      "grad_norm": 1.9732014714384287,
      "learning_rate": 1.547710292092301e-05,
      "loss": 0.522,
      "step": 1085
    },
    {
      "epoch": 1.9292035398230087,
      "grad_norm": 1.650645415727171,
      "learning_rate": 1.5425297849278714e-05,
      "loss": 0.511,
      "step": 1090
    },
    {
      "epoch": 1.9380530973451329,
      "grad_norm": 1.8379847429115603,
      "learning_rate": 1.5373285615266884e-05,
      "loss": 0.5234,
      "step": 1095
    },
    {
      "epoch": 1.9469026548672566,
      "grad_norm": 1.8823141679209345,
      "learning_rate": 1.5321068204949465e-05,
      "loss": 0.494,
      "step": 1100
    },
    {
      "epoch": 1.9557522123893807,
      "grad_norm": 1.7345679128452123,
      "learning_rate": 1.526864761222294e-05,
      "loss": 0.4964,
      "step": 1105
    },
    {
      "epoch": 1.9646017699115044,
      "grad_norm": 2.076946865200806,
      "learning_rate": 1.5216025838742226e-05,
      "loss": 0.487,
      "step": 1110
    },
    {
      "epoch": 1.9734513274336283,
      "grad_norm": 1.6316830069632373,
      "learning_rate": 1.5163204893844223e-05,
      "loss": 0.4799,
      "step": 1115
    },
    {
      "epoch": 1.9823008849557522,
      "grad_norm": 1.5027611303212294,
      "learning_rate": 1.5110186794471105e-05,
      "loss": 0.5015,
      "step": 1120
    },
    {
      "epoch": 1.991150442477876,
      "grad_norm": 1.4539381914768303,
      "learning_rate": 1.505697356509328e-05,
      "loss": 0.4817,
      "step": 1125
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6088305066129869,
      "learning_rate": 1.5003567237632113e-05,
      "loss": 0.4922,
      "step": 1130
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.385861873626709,
      "eval_runtime": 342.0399,
      "eval_samples_per_second": 21.989,
      "eval_steps_per_second": 0.345,
      "step": 1130
    },
    {
      "epoch": 2.0088495575221237,
      "grad_norm": 2.7761717129790804,
      "learning_rate": 1.4949969851382315e-05,
      "loss": 0.3518,
      "step": 1135
    },
    {
      "epoch": 2.017699115044248,
      "grad_norm": 1.7619273167046978,
      "learning_rate": 1.4896183452934087e-05,
      "loss": 0.3277,
      "step": 1140
    },
    {
      "epoch": 2.0265486725663715,
      "grad_norm": 1.7864597766278332,
      "learning_rate": 1.4842210096094984e-05,
      "loss": 0.3257,
      "step": 1145
    },
    {
      "epoch": 2.0353982300884956,
      "grad_norm": 2.7271140490759724,
      "learning_rate": 1.478805184181145e-05,
      "loss": 0.3358,
      "step": 1150
    },
    {
      "epoch": 2.0442477876106193,
      "grad_norm": 1.6882000707315932,
      "learning_rate": 1.4733710758090175e-05,
      "loss": 0.3295,
      "step": 1155
    },
    {
      "epoch": 2.0530973451327434,
      "grad_norm": 1.6306905404465186,
      "learning_rate": 1.4679188919919076e-05,
      "loss": 0.3355,
      "step": 1160
    },
    {
      "epoch": 2.061946902654867,
      "grad_norm": 1.7785685190803522,
      "learning_rate": 1.4624488409188116e-05,
      "loss": 0.329,
      "step": 1165
    },
    {
      "epoch": 2.0707964601769913,
      "grad_norm": 1.7355960995540745,
      "learning_rate": 1.4569611314609767e-05,
      "loss": 0.3384,
      "step": 1170
    },
    {
      "epoch": 2.079646017699115,
      "grad_norm": 4.525568032764957,
      "learning_rate": 1.4514559731639273e-05,
      "loss": 0.3318,
      "step": 1175
    },
    {
      "epoch": 2.088495575221239,
      "grad_norm": 28.29868813913778,
      "learning_rate": 1.4459335762394637e-05,
      "loss": 0.3307,
      "step": 1180
    },
    {
      "epoch": 2.0973451327433628,
      "grad_norm": 3.34159347381774,
      "learning_rate": 1.4403941515576344e-05,
      "loss": 0.331,
      "step": 1185
    },
    {
      "epoch": 2.106194690265487,
      "grad_norm": 2.279005617470207,
      "learning_rate": 1.434837910638685e-05,
      "loss": 0.3506,
      "step": 1190
    },
    {
      "epoch": 2.1150442477876106,
      "grad_norm": 1.818174492390053,
      "learning_rate": 1.42926506564498e-05,
      "loss": 0.335,
      "step": 1195
    },
    {
      "epoch": 2.1238938053097347,
      "grad_norm": 2.21280984926424,
      "learning_rate": 1.4236758293729034e-05,
      "loss": 0.3383,
      "step": 1200
    },
    {
      "epoch": 2.1327433628318584,
      "grad_norm": 2.6931991751982154,
      "learning_rate": 1.4180704152447322e-05,
      "loss": 0.3431,
      "step": 1205
    },
    {
      "epoch": 2.1415929203539825,
      "grad_norm": 3.1061095288545504,
      "learning_rate": 1.4124490373004864e-05,
      "loss": 0.3485,
      "step": 1210
    },
    {
      "epoch": 2.150442477876106,
      "grad_norm": 1.8692302359437503,
      "learning_rate": 1.4068119101897568e-05,
      "loss": 0.3482,
      "step": 1215
    },
    {
      "epoch": 2.15929203539823,
      "grad_norm": 4.590488061615715,
      "learning_rate": 1.4011592491635088e-05,
      "loss": 0.3349,
      "step": 1220
    },
    {
      "epoch": 2.168141592920354,
      "grad_norm": 1.7406001344065656,
      "learning_rate": 1.3954912700658626e-05,
      "loss": 0.33,
      "step": 1225
    },
    {
      "epoch": 2.1769911504424777,
      "grad_norm": 2.147923934228603,
      "learning_rate": 1.389808189325851e-05,
      "loss": 0.3384,
      "step": 1230
    },
    {
      "epoch": 2.185840707964602,
      "grad_norm": 1.980447202723778,
      "learning_rate": 1.3841102239491567e-05,
      "loss": 0.3409,
      "step": 1235
    },
    {
      "epoch": 2.1946902654867255,
      "grad_norm": 1.754785548742135,
      "learning_rate": 1.3783975915098244e-05,
      "loss": 0.3267,
      "step": 1240
    },
    {
      "epoch": 2.2035398230088497,
      "grad_norm": 1.685034715782438,
      "learning_rate": 1.3726705101419538e-05,
      "loss": 0.3173,
      "step": 1245
    },
    {
      "epoch": 2.2123893805309733,
      "grad_norm": 1.697282116117874,
      "learning_rate": 1.3669291985313695e-05,
      "loss": 0.3422,
      "step": 1250
    },
    {
      "epoch": 2.2212389380530975,
      "grad_norm": 1.7593945507869428,
      "learning_rate": 1.3611738759072712e-05,
      "loss": 0.33,
      "step": 1255
    },
    {
      "epoch": 2.230088495575221,
      "grad_norm": 1.7917883090230355,
      "learning_rate": 1.3554047620338629e-05,
      "loss": 0.3305,
      "step": 1260
    },
    {
      "epoch": 2.2389380530973453,
      "grad_norm": 1.8630367840502837,
      "learning_rate": 1.3496220772019597e-05,
      "loss": 0.3331,
      "step": 1265
    },
    {
      "epoch": 2.247787610619469,
      "grad_norm": 1.4789663707909015,
      "learning_rate": 1.3438260422205779e-05,
      "loss": 0.3388,
      "step": 1270
    },
    {
      "epoch": 2.256637168141593,
      "grad_norm": 1.605944505641843,
      "learning_rate": 1.3380168784085028e-05,
      "loss": 0.3366,
      "step": 1275
    },
    {
      "epoch": 2.265486725663717,
      "grad_norm": 1.6963565599437993,
      "learning_rate": 1.3321948075858377e-05,
      "loss": 0.3563,
      "step": 1280
    },
    {
      "epoch": 2.274336283185841,
      "grad_norm": 1.6228304534956426,
      "learning_rate": 1.3263600520655333e-05,
      "loss": 0.3365,
      "step": 1285
    },
    {
      "epoch": 2.2831858407079646,
      "grad_norm": 1.76469997194976,
      "learning_rate": 1.3205128346449003e-05,
      "loss": 0.3443,
      "step": 1290
    },
    {
      "epoch": 2.2920353982300883,
      "grad_norm": 1.645969241835665,
      "learning_rate": 1.3146533785970997e-05,
      "loss": 0.3288,
      "step": 1295
    },
    {
      "epoch": 2.3008849557522124,
      "grad_norm": 1.6481237108795372,
      "learning_rate": 1.3087819076626201e-05,
      "loss": 0.3314,
      "step": 1300
    },
    {
      "epoch": 2.309734513274336,
      "grad_norm": 1.672673680905006,
      "learning_rate": 1.3028986460407312e-05,
      "loss": 0.3142,
      "step": 1305
    },
    {
      "epoch": 2.3185840707964602,
      "grad_norm": 1.6126633305212483,
      "learning_rate": 1.297003818380926e-05,
      "loss": 0.3331,
      "step": 1310
    },
    {
      "epoch": 2.327433628318584,
      "grad_norm": 1.6511744854665646,
      "learning_rate": 1.2910976497743389e-05,
      "loss": 0.321,
      "step": 1315
    },
    {
      "epoch": 2.336283185840708,
      "grad_norm": 1.6614647687728916,
      "learning_rate": 1.2851803657451554e-05,
      "loss": 0.34,
      "step": 1320
    },
    {
      "epoch": 2.3451327433628317,
      "grad_norm": 1.5556961404816416,
      "learning_rate": 1.2792521922419958e-05,
      "loss": 0.3378,
      "step": 1325
    },
    {
      "epoch": 2.353982300884956,
      "grad_norm": 1.607457047378207,
      "learning_rate": 1.2733133556292914e-05,
      "loss": 0.3277,
      "step": 1330
    },
    {
      "epoch": 2.3628318584070795,
      "grad_norm": 1.6195875099518586,
      "learning_rate": 1.2673640826786378e-05,
      "loss": 0.3268,
      "step": 1335
    },
    {
      "epoch": 2.3716814159292037,
      "grad_norm": 1.5352915106052365,
      "learning_rate": 1.2614046005601377e-05,
      "loss": 0.3186,
      "step": 1340
    },
    {
      "epoch": 2.3805309734513274,
      "grad_norm": 1.5797503971889433,
      "learning_rate": 1.2554351368337262e-05,
      "loss": 0.3344,
      "step": 1345
    },
    {
      "epoch": 2.3893805309734515,
      "grad_norm": 2.5943284467596253,
      "learning_rate": 1.2494559194404809e-05,
      "loss": 0.3468,
      "step": 1350
    },
    {
      "epoch": 2.398230088495575,
      "grad_norm": 1.7257039415600963,
      "learning_rate": 1.2434671766939184e-05,
      "loss": 0.3348,
      "step": 1355
    },
    {
      "epoch": 2.4070796460176993,
      "grad_norm": 1.892133636249101,
      "learning_rate": 1.2374691372712761e-05,
      "loss": 0.3276,
      "step": 1360
    },
    {
      "epoch": 2.415929203539823,
      "grad_norm": 1.959589712843592,
      "learning_rate": 1.2314620302047818e-05,
      "loss": 0.3273,
      "step": 1365
    },
    {
      "epoch": 2.4247787610619467,
      "grad_norm": 1.5186200162272043,
      "learning_rate": 1.2254460848729046e-05,
      "loss": 0.3274,
      "step": 1370
    },
    {
      "epoch": 2.433628318584071,
      "grad_norm": 1.5473486484026677,
      "learning_rate": 1.2194215309916005e-05,
      "loss": 0.3443,
      "step": 1375
    },
    {
      "epoch": 2.442477876106195,
      "grad_norm": 1.6097900368392104,
      "learning_rate": 1.2133885986055379e-05,
      "loss": 0.3179,
      "step": 1380
    },
    {
      "epoch": 2.4513274336283186,
      "grad_norm": 1.59719321690748,
      "learning_rate": 1.2073475180793144e-05,
      "loss": 0.324,
      "step": 1385
    },
    {
      "epoch": 2.4601769911504423,
      "grad_norm": 1.657560283923696,
      "learning_rate": 1.2012985200886602e-05,
      "loss": 0.3279,
      "step": 1390
    },
    {
      "epoch": 2.4690265486725664,
      "grad_norm": 1.6312886292352506,
      "learning_rate": 1.1952418356116309e-05,
      "loss": 0.342,
      "step": 1395
    },
    {
      "epoch": 2.47787610619469,
      "grad_norm": 1.673413598360019,
      "learning_rate": 1.1891776959197854e-05,
      "loss": 0.3325,
      "step": 1400
    },
    {
      "epoch": 2.4867256637168142,
      "grad_norm": 1.5641257680673357,
      "learning_rate": 1.1831063325693578e-05,
      "loss": 0.33,
      "step": 1405
    },
    {
      "epoch": 2.495575221238938,
      "grad_norm": 1.557582800520083,
      "learning_rate": 1.1770279773924133e-05,
      "loss": 0.3229,
      "step": 1410
    },
    {
      "epoch": 2.504424778761062,
      "grad_norm": 2.0076002123451033,
      "learning_rate": 1.1709428624879971e-05,
      "loss": 0.338,
      "step": 1415
    },
    {
      "epoch": 2.5132743362831858,
      "grad_norm": 1.7747271993836249,
      "learning_rate": 1.1648512202132705e-05,
      "loss": 0.3312,
      "step": 1420
    },
    {
      "epoch": 2.52212389380531,
      "grad_norm": 1.7345249327456702,
      "learning_rate": 1.15875328317464e-05,
      "loss": 0.3324,
      "step": 1425
    },
    {
      "epoch": 2.5309734513274336,
      "grad_norm": 1.5526595051928562,
      "learning_rate": 1.1526492842188746e-05,
      "loss": 0.3183,
      "step": 1430
    },
    {
      "epoch": 2.5398230088495577,
      "grad_norm": 1.7188069365366003,
      "learning_rate": 1.1465394564242142e-05,
      "loss": 0.3382,
      "step": 1435
    },
    {
      "epoch": 2.5486725663716814,
      "grad_norm": 1.5515396732118238,
      "learning_rate": 1.1404240330914706e-05,
      "loss": 0.3214,
      "step": 1440
    },
    {
      "epoch": 2.557522123893805,
      "grad_norm": 1.7016705926250035,
      "learning_rate": 1.1343032477351183e-05,
      "loss": 0.341,
      "step": 1445
    },
    {
      "epoch": 2.566371681415929,
      "grad_norm": 1.700802699390158,
      "learning_rate": 1.128177334074377e-05,
      "loss": 0.3206,
      "step": 1450
    },
    {
      "epoch": 2.5752212389380533,
      "grad_norm": 1.5960912094984059,
      "learning_rate": 1.122046526024291e-05,
      "loss": 0.3155,
      "step": 1455
    },
    {
      "epoch": 2.584070796460177,
      "grad_norm": 1.7339788975369348,
      "learning_rate": 1.1159110576867915e-05,
      "loss": 0.3239,
      "step": 1460
    },
    {
      "epoch": 2.5929203539823007,
      "grad_norm": 1.6059957404958596,
      "learning_rate": 1.1097711633417623e-05,
      "loss": 0.3221,
      "step": 1465
    },
    {
      "epoch": 2.601769911504425,
      "grad_norm": 1.8427719621309488,
      "learning_rate": 1.1036270774380906e-05,
      "loss": 0.3304,
      "step": 1470
    },
    {
      "epoch": 2.6106194690265485,
      "grad_norm": 1.631329628154416,
      "learning_rate": 1.0974790345847187e-05,
      "loss": 0.3202,
      "step": 1475
    },
    {
      "epoch": 2.6194690265486726,
      "grad_norm": 1.588519061268432,
      "learning_rate": 1.0913272695416807e-05,
      "loss": 0.3262,
      "step": 1480
    },
    {
      "epoch": 2.6283185840707963,
      "grad_norm": 1.6832971532585765,
      "learning_rate": 1.085172017211142e-05,
      "loss": 0.343,
      "step": 1485
    },
    {
      "epoch": 2.6371681415929205,
      "grad_norm": 1.6201470424053184,
      "learning_rate": 1.0790135126284275e-05,
      "loss": 0.3173,
      "step": 1490
    },
    {
      "epoch": 2.646017699115044,
      "grad_norm": 1.6068035539370134,
      "learning_rate": 1.072851990953049e-05,
      "loss": 0.3302,
      "step": 1495
    },
    {
      "epoch": 2.6548672566371683,
      "grad_norm": 1.6527582221176542,
      "learning_rate": 1.0666876874597235e-05,
      "loss": 0.317,
      "step": 1500
    },
    {
      "epoch": 2.663716814159292,
      "grad_norm": 1.6872940097146771,
      "learning_rate": 1.0605208375293905e-05,
      "loss": 0.3327,
      "step": 1505
    },
    {
      "epoch": 2.672566371681416,
      "grad_norm": 1.9993336890377256,
      "learning_rate": 1.0543516766402245e-05,
      "loss": 0.327,
      "step": 1510
    },
    {
      "epoch": 2.6814159292035398,
      "grad_norm": 1.8925758992460178,
      "learning_rate": 1.0481804403586421e-05,
      "loss": 0.3232,
      "step": 1515
    },
    {
      "epoch": 2.6902654867256635,
      "grad_norm": 1.7506417492910218,
      "learning_rate": 1.0420073643303085e-05,
      "loss": 0.3236,
      "step": 1520
    },
    {
      "epoch": 2.6991150442477876,
      "grad_norm": 1.7254760531768936,
      "learning_rate": 1.0358326842711383e-05,
      "loss": 0.3376,
      "step": 1525
    },
    {
      "epoch": 2.7079646017699117,
      "grad_norm": 1.6056971874044912,
      "learning_rate": 1.0296566359582951e-05,
      "loss": 0.3197,
      "step": 1530
    },
    {
      "epoch": 2.7168141592920354,
      "grad_norm": 1.692264180914051,
      "learning_rate": 1.023479455221189e-05,
      "loss": 0.317,
      "step": 1535
    },
    {
      "epoch": 2.725663716814159,
      "grad_norm": 1.755215116123095,
      "learning_rate": 1.0173013779324714e-05,
      "loss": 0.3309,
      "step": 1540
    },
    {
      "epoch": 2.734513274336283,
      "grad_norm": 1.6225703848243365,
      "learning_rate": 1.0111226399990267e-05,
      "loss": 0.3247,
      "step": 1545
    },
    {
      "epoch": 2.7433628318584073,
      "grad_norm": 1.574502935573898,
      "learning_rate": 1.0049434773529678e-05,
      "loss": 0.3193,
      "step": 1550
    },
    {
      "epoch": 2.752212389380531,
      "grad_norm": 1.642771315319423,
      "learning_rate": 9.98764125942623e-06,
      "loss": 0.3304,
      "step": 1555
    },
    {
      "epoch": 2.7610619469026547,
      "grad_norm": 1.5731646251204943,
      "learning_rate": 9.9258482172353e-06,
      "loss": 0.3438,
      "step": 1560
    },
    {
      "epoch": 2.769911504424779,
      "grad_norm": 1.8184099244247223,
      "learning_rate": 9.864058006494237e-06,
      "loss": 0.3278,
      "step": 1565
    },
    {
      "epoch": 2.7787610619469025,
      "grad_norm": 1.9834453880133907,
      "learning_rate": 9.80227298663227e-06,
      "loss": 0.3305,
      "step": 1570
    },
    {
      "epoch": 2.7876106194690267,
      "grad_norm": 1.9264046453250971,
      "learning_rate": 9.740495516880428e-06,
      "loss": 0.3158,
      "step": 1575
    },
    {
      "epoch": 2.7964601769911503,
      "grad_norm": 2.1903532218624857,
      "learning_rate": 9.678727956181438e-06,
      "loss": 0.3267,
      "step": 1580
    },
    {
      "epoch": 2.8053097345132745,
      "grad_norm": 1.8158272144124583,
      "learning_rate": 9.616972663099648e-06,
      "loss": 0.342,
      "step": 1585
    },
    {
      "epoch": 2.814159292035398,
      "grad_norm": 2.7692218225078724,
      "learning_rate": 9.55523199573098e-06,
      "loss": 0.3258,
      "step": 1590
    },
    {
      "epoch": 2.823008849557522,
      "grad_norm": 1.6127123082950938,
      "learning_rate": 9.493508311612874e-06,
      "loss": 0.3214,
      "step": 1595
    },
    {
      "epoch": 2.831858407079646,
      "grad_norm": 1.7692693875404215,
      "learning_rate": 9.431803967634284e-06,
      "loss": 0.337,
      "step": 1600
    },
    {
      "epoch": 2.84070796460177,
      "grad_norm": 1.5722273068604184,
      "learning_rate": 9.370121319945657e-06,
      "loss": 0.3354,
      "step": 1605
    },
    {
      "epoch": 2.849557522123894,
      "grad_norm": 1.5455898969077757,
      "learning_rate": 9.308462723868987e-06,
      "loss": 0.3203,
      "step": 1610
    },
    {
      "epoch": 2.8584070796460175,
      "grad_norm": 1.6331654733208454,
      "learning_rate": 9.246830533807857e-06,
      "loss": 0.3215,
      "step": 1615
    },
    {
      "epoch": 2.8672566371681416,
      "grad_norm": 1.6276033261489797,
      "learning_rate": 9.185227103157573e-06,
      "loss": 0.3152,
      "step": 1620
    },
    {
      "epoch": 2.8761061946902657,
      "grad_norm": 1.6614797915917234,
      "learning_rate": 9.12365478421525e-06,
      "loss": 0.3214,
      "step": 1625
    },
    {
      "epoch": 2.8849557522123894,
      "grad_norm": 1.732426911950826,
      "learning_rate": 9.062115928090036e-06,
      "loss": 0.3068,
      "step": 1630
    },
    {
      "epoch": 2.893805309734513,
      "grad_norm": 1.518118162492201,
      "learning_rate": 9.000612884613306e-06,
      "loss": 0.3126,
      "step": 1635
    },
    {
      "epoch": 2.9026548672566372,
      "grad_norm": 1.6702030494722016,
      "learning_rate": 8.939148002248954e-06,
      "loss": 0.3348,
      "step": 1640
    },
    {
      "epoch": 2.911504424778761,
      "grad_norm": 1.7443283818272917,
      "learning_rate": 8.877723628003703e-06,
      "loss": 0.3266,
      "step": 1645
    },
    {
      "epoch": 2.920353982300885,
      "grad_norm": 1.6887892210021154,
      "learning_rate": 8.816342107337501e-06,
      "loss": 0.331,
      "step": 1650
    },
    {
      "epoch": 2.9292035398230087,
      "grad_norm": 1.6444996183942215,
      "learning_rate": 8.755005784073948e-06,
      "loss": 0.3078,
      "step": 1655
    },
    {
      "epoch": 2.938053097345133,
      "grad_norm": 1.5829832005281133,
      "learning_rate": 8.693717000310801e-06,
      "loss": 0.3071,
      "step": 1660
    },
    {
      "epoch": 2.9469026548672566,
      "grad_norm": 1.7157739006910784,
      "learning_rate": 8.632478096330559e-06,
      "loss": 0.3255,
      "step": 1665
    },
    {
      "epoch": 2.9557522123893807,
      "grad_norm": 1.5977542612636495,
      "learning_rate": 8.571291410511063e-06,
      "loss": 0.3176,
      "step": 1670
    },
    {
      "epoch": 2.9646017699115044,
      "grad_norm": 1.6999676018113206,
      "learning_rate": 8.510159279236244e-06,
      "loss": 0.3275,
      "step": 1675
    },
    {
      "epoch": 2.9734513274336285,
      "grad_norm": 1.8724555229847881,
      "learning_rate": 8.449084036806893e-06,
      "loss": 0.3201,
      "step": 1680
    },
    {
      "epoch": 2.982300884955752,
      "grad_norm": 1.980909859882966,
      "learning_rate": 8.388068015351521e-06,
      "loss": 0.3105,
      "step": 1685
    },
    {
      "epoch": 2.991150442477876,
      "grad_norm": 1.797933699630095,
      "learning_rate": 8.327113544737325e-06,
      "loss": 0.3207,
      "step": 1690
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.6572798098507155,
      "learning_rate": 8.2662229524812e-06,
      "loss": 0.3003,
      "step": 1695
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.235044464468956,
      "eval_runtime": 341.1312,
      "eval_samples_per_second": 22.047,
      "eval_steps_per_second": 0.346,
      "step": 1695
    },
{
"epoch": 3.0088495575221237,
"grad_norm": 2.353752696189,
"learning_rate": 8.205398563660886e-06,
"loss": 0.179,
"step": 1700
},
{
"epoch": 3.017699115044248,
"grad_norm": 2.3653722927763288,
"learning_rate": 8.144642700826182e-06,
"loss": 0.1704,
"step": 1705
},
{
"epoch": 3.0265486725663715,
"grad_norm": 1.9259007577930276,
"learning_rate": 8.08395768391024e-06,
"loss": 0.1707,
"step": 1710
},
{
"epoch": 3.0353982300884956,
"grad_norm": 1.893459900305888,
"learning_rate": 8.02334583014101e-06,
"loss": 0.1675,
"step": 1715
},
{
"epoch": 3.0442477876106193,
"grad_norm": 2.0717584069847037,
"learning_rate": 7.96280945395273e-06,
"loss": 0.1839,
"step": 1720
},
{
"epoch": 3.0530973451327434,
"grad_norm": 2.114700678563183,
"learning_rate": 7.902350866897573e-06,
"loss": 0.1793,
"step": 1725
},
{
"epoch": 3.061946902654867,
"grad_norm": 1.8731510987373088,
"learning_rate": 7.841972377557366e-06,
"loss": 0.1846,
"step": 1730
},
{
"epoch": 3.0707964601769913,
"grad_norm": 1.722257787492872,
"learning_rate": 7.78167629145545e-06,
"loss": 0.1697,
"step": 1735
},
{
"epoch": 3.079646017699115,
"grad_norm": 1.705359298929866,
"learning_rate": 7.721464910968628e-06,
"loss": 0.1687,
"step": 1740
},
{
"epoch": 3.088495575221239,
"grad_norm": 1.8344884220833366,
"learning_rate": 7.661340535239266e-06,
"loss": 0.1724,
"step": 1745
},
{
"epoch": 3.0973451327433628,
"grad_norm": 1.7566442049461333,
"learning_rate": 7.6013054600875005e-06,
"loss": 0.1754,
"step": 1750
},
{
"epoch": 3.106194690265487,
"grad_norm": 1.6129096564380034,
"learning_rate": 7.541361977923564e-06,
"loss": 0.1667,
"step": 1755
},
{
"epoch": 3.1150442477876106,
"grad_norm": 1.7068252140891824,
"learning_rate": 7.481512377660251e-06,
"loss": 0.1667,
"step": 1760
},
{
"epoch": 3.1238938053097347,
"grad_norm": 1.6531864536593857,
"learning_rate": 7.421758944625528e-06,
"loss": 0.1785,
"step": 1765
},
{
"epoch": 3.1327433628318584,
"grad_norm": 1.821254758412697,
"learning_rate": 7.362103960475258e-06,
"loss": 0.1698,
"step": 1770
},
{
"epoch": 3.1415929203539825,
"grad_norm": 1.7780006102159944,
"learning_rate": 7.302549703106084e-06,
"loss": 0.1828,
"step": 1775
},
{
"epoch": 3.150442477876106,
"grad_norm": 2.39713947955002,
"learning_rate": 7.243098446568442e-06,
"loss": 0.1736,
"step": 1780
},
{
"epoch": 3.15929203539823,
"grad_norm": 1.6290436678999984,
"learning_rate": 7.183752460979737e-06,
"loss": 0.1699,
"step": 1785
},
{
"epoch": 3.168141592920354,
"grad_norm": 1.582870471036582,
"learning_rate": 7.124514012437645e-06,
"loss": 0.1718,
"step": 1790
},
{
"epoch": 3.1769911504424777,
"grad_norm": 1.5585723392749353,
"learning_rate": 7.065385362933603e-06,
"loss": 0.166,
"step": 1795
},
{
"epoch": 3.185840707964602,
"grad_norm": 1.6356629585103508,
"learning_rate": 7.006368770266421e-06,
"loss": 0.1738,
"step": 1800
},
{
"epoch": 3.1946902654867255,
"grad_norm": 1.8092852534302832,
"learning_rate": 6.947466487956067e-06,
"loss": 0.184,
"step": 1805
},
{
"epoch": 3.2035398230088497,
"grad_norm": 1.763331264313237,
"learning_rate": 6.88868076515763e-06,
"loss": 0.1747,
"step": 1810
},
{
"epoch": 3.2123893805309733,
"grad_norm": 1.6618660989856646,
"learning_rate": 6.83001384657543e-06,
"loss": 0.1753,
"step": 1815
},
{
"epoch": 3.2212389380530975,
"grad_norm": 1.587869179848222,
"learning_rate": 6.7714679723772996e-06,
"loss": 0.177,
"step": 1820
},
{
"epoch": 3.230088495575221,
"grad_norm": 1.7012507664894645,
"learning_rate": 6.713045378109058e-06,
"loss": 0.182,
"step": 1825
},
{
"epoch": 3.2389380530973453,
"grad_norm": 3.092871482624404,
"learning_rate": 6.654748294609137e-06,
"loss": 0.1749,
"step": 1830
},
{
"epoch": 3.247787610619469,
"grad_norm": 5.246672357838822,
"learning_rate": 6.596578947923395e-06,
"loss": 0.1852,
"step": 1835
},
{
"epoch": 3.256637168141593,
"grad_norm": 2.6149168080932492,
"learning_rate": 6.538539559220141e-06,
"loss": 0.1717,
"step": 1840
},
{
"epoch": 3.265486725663717,
"grad_norm": 2.298194261929981,
"learning_rate": 6.480632344705274e-06,
"loss": 0.1827,
"step": 1845
},
{
"epoch": 3.274336283185841,
"grad_norm": 2.0137609080864083,
"learning_rate": 6.422859515537709e-06,
"loss": 0.1783,
"step": 1850
},
{
"epoch": 3.2831858407079646,
"grad_norm": 1.9067096587439358,
"learning_rate": 6.365223277744907e-06,
"loss": 0.1762,
"step": 1855
},
{
"epoch": 3.2920353982300883,
"grad_norm": 1.749958280136199,
"learning_rate": 6.3077258321386604e-06,
"loss": 0.1666,
"step": 1860
},
{
"epoch": 3.3008849557522124,
"grad_norm": 1.6753892133330817,
"learning_rate": 6.25036937423105e-06,
"loss": 0.1817,
"step": 1865
},
{
"epoch": 3.309734513274336,
"grad_norm": 1.770861004923346,
"learning_rate": 6.1931560941506055e-06,
"loss": 0.1753,
"step": 1870
},
{
"epoch": 3.3185840707964602,
"grad_norm": 1.6941989909031387,
"learning_rate": 6.136088176558683e-06,
"loss": 0.1683,
"step": 1875
},
{
"epoch": 3.327433628318584,
"grad_norm": 1.7818744676278702,
"learning_rate": 6.07916780056604e-06,
"loss": 0.1819,
"step": 1880
},
{
"epoch": 3.336283185840708,
"grad_norm": 1.7169221713713938,
"learning_rate": 6.022397139649636e-06,
"loss": 0.1753,
"step": 1885
},
{
"epoch": 3.3451327433628317,
"grad_norm": 1.7383379208379126,
"learning_rate": 5.96577836156963e-06,
"loss": 0.1772,
"step": 1890
},
{
"epoch": 3.353982300884956,
"grad_norm": 10.593484473821267,
"learning_rate": 5.9093136282866014e-06,
"loss": 0.1776,
"step": 1895
},
{
"epoch": 3.3628318584070795,
"grad_norm": 7.892581375758647,
"learning_rate": 5.853005095879015e-06,
"loss": 0.177,
"step": 1900
},
{
"epoch": 3.3716814159292037,
"grad_norm": 3.632404735066819,
"learning_rate": 5.796854914460873e-06,
"loss": 0.1819,
"step": 1905
},
{
"epoch": 3.3805309734513274,
"grad_norm": 2.48312406396527,
"learning_rate": 5.740865228099621e-06,
"loss": 0.1765,
"step": 1910
},
{
"epoch": 3.3893805309734515,
"grad_norm": 2.2660397170129993,
"learning_rate": 5.68503817473429e-06,
"loss": 0.1833,
"step": 1915
},
{
"epoch": 3.398230088495575,
"grad_norm": 1.9157388795853385,
"learning_rate": 5.629375886093835e-06,
"loss": 0.1735,
"step": 1920
},
{
"epoch": 3.4070796460176993,
"grad_norm": 1.9732798667035623,
"learning_rate": 5.573880487615755e-06,
"loss": 0.1776,
"step": 1925
},
{
"epoch": 3.415929203539823,
"grad_norm": 1.694802471034398,
"learning_rate": 5.518554098364932e-06,
"loss": 0.1723,
"step": 1930
},
{
"epoch": 3.4247787610619467,
"grad_norm": 1.607717285267641,
"learning_rate": 5.463398830952714e-06,
"loss": 0.1699,
"step": 1935
},
{
"epoch": 3.433628318584071,
"grad_norm": 1.742187593317827,
"learning_rate": 5.408416791456239e-06,
"loss": 0.1829,
"step": 1940
},
{
"epoch": 3.442477876106195,
"grad_norm": 1.6295575263043325,
"learning_rate": 5.3536100793380234e-06,
"loss": 0.168,
"step": 1945
},
{
"epoch": 3.4513274336283186,
"grad_norm": 1.5116611834425375,
"learning_rate": 5.298980787365785e-06,
"loss": 0.1733,
"step": 1950
},
{
"epoch": 3.4601769911504423,
"grad_norm": 1.7614935769749696,
"learning_rate": 5.244531001532558e-06,
"loss": 0.1639,
"step": 1955
},
{
"epoch": 3.4690265486725664,
"grad_norm": 1.4882500944214951,
"learning_rate": 5.190262800977007e-06,
"loss": 0.1623,
"step": 1960
},
{
"epoch": 3.47787610619469,
"grad_norm": 1.7919227523392,
"learning_rate": 5.136178257904048e-06,
"loss": 0.1793,
"step": 1965
},
{
"epoch": 3.4867256637168142,
"grad_norm": 1.5720819445780152,
"learning_rate": 5.082279437505739e-06,
"loss": 0.1814,
"step": 1970
},
{
"epoch": 3.495575221238938,
"grad_norm": 1.777272679047606,
"learning_rate": 5.028568397882397e-06,
"loss": 0.1732,
"step": 1975
},
{
"epoch": 3.504424778761062,
"grad_norm": 1.661227110456758,
"learning_rate": 4.975047189964027e-06,
"loss": 0.1681,
"step": 1980
},
{
"epoch": 3.5132743362831858,
"grad_norm": 1.5442922066155755,
"learning_rate": 4.921717857431997e-06,
"loss": 0.165,
"step": 1985
},
{
"epoch": 3.52212389380531,
"grad_norm": 1.7475725868687078,
"learning_rate": 4.868582436641006e-06,
"loss": 0.1654,
"step": 1990
},
{
"epoch": 3.5309734513274336,
"grad_norm": 1.6819734436503684,
"learning_rate": 4.81564295654134e-06,
"loss": 0.1689,
"step": 1995
},
{
"epoch": 3.5398230088495577,
"grad_norm": 1.5921158221510299,
"learning_rate": 4.762901438601368e-06,
"loss": 0.1712,
"step": 2000
},
{
"epoch": 3.5486725663716814,
"grad_norm": 1.7221909524561103,
"learning_rate": 4.710359896730379e-06,
"loss": 0.1761,
"step": 2005
},
{
"epoch": 3.557522123893805,
"grad_norm": 1.6159991650918777,
"learning_rate": 4.658020337201666e-06,
"loss": 0.1779,
"step": 2010
},
{
"epoch": 3.566371681415929,
"grad_norm": 1.6617020529449875,
"learning_rate": 4.6058847585759335e-06,
"loss": 0.1805,
"step": 2015
},
{
"epoch": 3.5752212389380533,
"grad_norm": 1.6141307718438942,
"learning_rate": 4.5539551516249735e-06,
"loss": 0.181,
"step": 2020
},
{
"epoch": 3.584070796460177,
"grad_norm": 1.613159926564658,
"learning_rate": 4.502233499255641e-06,
"loss": 0.1812,
"step": 2025
},
{
"epoch": 3.5929203539823007,
"grad_norm": 1.5123746738195785,
"learning_rate": 4.450721776434152e-06,
"loss": 0.1737,
"step": 2030
},
{
"epoch": 3.601769911504425,
"grad_norm": 1.59229310691719,
"learning_rate": 4.399421950110657e-06,
"loss": 0.156,
"step": 2035
},
{
"epoch": 3.6106194690265485,
"grad_norm": 1.5956185168711787,
"learning_rate": 4.348335979144158e-06,
"loss": 0.1739,
"step": 2040
},
{
"epoch": 3.6194690265486726,
"grad_norm": 1.5890271518636088,
"learning_rate": 4.297465814227678e-06,
"loss": 0.1682,
"step": 2045
},
{
"epoch": 3.6283185840707963,
"grad_norm": 1.5754061195856448,
"learning_rate": 4.2468133978137945e-06,
"loss": 0.172,
"step": 2050
},
{
"epoch": 3.6371681415929205,
"grad_norm": 1.5165865020871168,
"learning_rate": 4.196380664040468e-06,
"loss": 0.1514,
"step": 2055
},
{
"epoch": 3.646017699115044,
"grad_norm": 1.47915973391418,
"learning_rate": 4.146169538657185e-06,
"loss": 0.1685,
"step": 2060
},
{
"epoch": 3.6548672566371683,
"grad_norm": 1.6368988601716563,
"learning_rate": 4.096181938951419e-06,
"loss": 0.163,
"step": 2065
},
{
"epoch": 3.663716814159292,
"grad_norm": 1.6225383306108707,
"learning_rate": 4.046419773675421e-06,
"loss": 0.1709,
"step": 2070
},
{
"epoch": 3.672566371681416,
"grad_norm": 1.4634676872930676,
"learning_rate": 3.9968849429733396e-06,
"loss": 0.1617,
"step": 2075
},
{
"epoch": 3.6814159292035398,
"grad_norm": 1.6544557711131658,
"learning_rate": 3.94757933830867e-06,
"loss": 0.1675,
"step": 2080
},
{
"epoch": 3.6902654867256635,
"grad_norm": 1.6512592824076686,
"learning_rate": 3.898504842392017e-06,
"loss": 0.1722,
"step": 2085
},
{
"epoch": 3.6991150442477876,
"grad_norm": 1.5886810130145228,
"learning_rate": 3.849663329109206e-06,
"loss": 0.1726,
"step": 2090
},
{
"epoch": 3.7079646017699117,
"grad_norm": 1.631842537467322,
"learning_rate": 3.801056663449737e-06,
"loss": 0.1598,
"step": 2095
},
{
"epoch": 3.7168141592920354,
"grad_norm": 1.5134893226587467,
"learning_rate": 3.7526867014355685e-06,
"loss": 0.1647,
"step": 2100
},
{
"epoch": 3.725663716814159,
"grad_norm": 1.5640824884747317,
"learning_rate": 3.70455529005025e-06,
"loss": 0.1739,
"step": 2105
},
{
"epoch": 3.734513274336283,
"grad_norm": 1.50638388767751,
"learning_rate": 3.6566642671683806e-06,
"loss": 0.1644,
"step": 2110
},
{
"epoch": 3.7433628318584073,
"grad_norm": 1.6707015790458888,
"learning_rate": 3.6090154614854432e-06,
"loss": 0.1623,
"step": 2115
},
{
"epoch": 3.752212389380531,
"grad_norm": 1.6269953969553939,
"learning_rate": 3.561610692447982e-06,
"loss": 0.1603,
"step": 2120
},
{
"epoch": 3.7610619469026547,
"grad_norm": 1.6173678776254214,
"learning_rate": 3.514451770184113e-06,
"loss": 0.1751,
"step": 2125
},
{
"epoch": 3.769911504424779,
"grad_norm": 2.901697583641324,
"learning_rate": 3.467540495434415e-06,
"loss": 0.1599,
"step": 2130
},
{
"epoch": 3.7787610619469025,
"grad_norm": 1.6837282238012816,
"learning_rate": 3.420878659483161e-06,
"loss": 0.1636,
"step": 2135
},
{
"epoch": 3.7876106194690267,
"grad_norm": 1.4175543694096393,
"learning_rate": 3.374468044089937e-06,
"loss": 0.1649,
"step": 2140
},
{
"epoch": 3.7964601769911503,
"grad_norm": 1.7233110576016455,
"learning_rate": 3.328310421421579e-06,
"loss": 0.1641,
"step": 2145
},
{
"epoch": 3.8053097345132745,
"grad_norm": 1.5605391560510777,
"learning_rate": 3.2824075539845334e-06,
"loss": 0.1705,
"step": 2150
},
{
"epoch": 3.814159292035398,
"grad_norm": 1.5377565176020294,
"learning_rate": 3.2367611945575308e-06,
"loss": 0.1539,
"step": 2155
},
{
"epoch": 3.823008849557522,
"grad_norm": 1.5868929794341793,
"learning_rate": 3.191373086124666e-06,
"loss": 0.1709,
"step": 2160
},
{
"epoch": 3.831858407079646,
"grad_norm": 1.5622595706222757,
"learning_rate": 3.1462449618088576e-06,
"loss": 0.1559,
"step": 2165
},
{
"epoch": 3.84070796460177,
"grad_norm": 1.7270166919786822,
"learning_rate": 3.1013785448056454e-06,
"loss": 0.1748,
"step": 2170
},
{
"epoch": 3.849557522123894,
"grad_norm": 1.5748266845975978,
"learning_rate": 3.0567755483174043e-06,
"loss": 0.1726,
"step": 2175
},
{
"epoch": 3.8584070796460175,
"grad_norm": 1.5844961174820655,
"learning_rate": 3.0124376754879305e-06,
"loss": 0.1586,
"step": 2180
},
{
"epoch": 3.8672566371681416,
"grad_norm": 2.1519156556967776,
"learning_rate": 2.968366619337394e-06,
"loss": 0.1619,
"step": 2185
},
{
"epoch": 3.8761061946902657,
"grad_norm": 1.5353376420507088,
"learning_rate": 2.9245640626977012e-06,
"loss": 0.1582,
"step": 2190
},
{
"epoch": 3.8849557522123894,
"grad_norm": 1.604740201034412,
"learning_rate": 2.881031678148244e-06,
"loss": 0.1632,
"step": 2195
},
{
"epoch": 3.893805309734513,
"grad_norm": 1.5866339698891518,
"learning_rate": 2.837771127952007e-06,
"loss": 0.1564,
"step": 2200
},
{
"epoch": 3.9026548672566372,
"grad_norm": 2.295835726400093,
"learning_rate": 2.7947840639921308e-06,
"loss": 0.163,
"step": 2205
},
{
"epoch": 3.911504424778761,
"grad_norm": 1.5757417684634734,
"learning_rate": 2.7520721277088023e-06,
"loss": 0.1596,
"step": 2210
},
{
"epoch": 3.920353982300885,
"grad_norm": 1.8301488972667728,
"learning_rate": 2.709636950036597e-06,
"loss": 0.1609,
"step": 2215
},
{
"epoch": 3.9292035398230087,
"grad_norm": 1.5193689628299787,
"learning_rate": 2.6674801513421945e-06,
"loss": 0.166,
"step": 2220
},
{
"epoch": 3.938053097345133,
"grad_norm": 1.514470929679295,
"learning_rate": 2.6256033413625136e-06,
"loss": 0.1579,
"step": 2225
},
{
"epoch": 3.9469026548672566,
"grad_norm": 1.5711723708738545,
"learning_rate": 2.584008119143234e-06,
"loss": 0.1636,
"step": 2230
},
{
"epoch": 3.9557522123893807,
"grad_norm": 1.5856863391487659,
"learning_rate": 2.5426960729777496e-06,
"loss": 0.1656,
"step": 2235
},
{
"epoch": 3.9646017699115044,
"grad_norm": 1.511859910396652,
"learning_rate": 2.5016687803465033e-06,
"loss": 0.1583,
"step": 2240
},
{
"epoch": 3.9734513274336285,
"grad_norm": 1.5002228905494757,
"learning_rate": 2.460927807856778e-06,
"loss": 0.1602,
"step": 2245
},
{
"epoch": 3.982300884955752,
"grad_norm": 1.4446988734661728,
"learning_rate": 2.4204747111828463e-06,
"loss": 0.1587,
"step": 2250
},
{
"epoch": 3.991150442477876,
"grad_norm": 1.549269756232167,
"learning_rate": 2.3803110350065884e-06,
"loss": 0.1696,
"step": 2255
},
{
"epoch": 4.0,
"grad_norm": 1.641037232857028,
"learning_rate": 2.3404383129585018e-06,
"loss": 0.1776,
"step": 2260
},
{
"epoch": 4.0,
"eval_loss": 0.1632937490940094,
"eval_runtime": 341.9824,
"eval_samples_per_second": 21.992,
"eval_steps_per_second": 0.345,
"step": 2260
},
{
"epoch": 4.008849557522124,
"grad_norm": 2.475219520944848,
"learning_rate": 2.3008580675591462e-06,
"loss": 0.0861,
"step": 2265
},
{
"epoch": 4.017699115044247,
"grad_norm": 1.729540631610469,
"learning_rate": 2.2615718101609986e-06,
"loss": 0.0782,
"step": 2270
},
{
"epoch": 4.0265486725663715,
"grad_norm": 1.467284022386594,
"learning_rate": 2.222581040890741e-06,
"loss": 0.0792,
"step": 2275
},
{
"epoch": 4.035398230088496,
"grad_norm": 1.4386951051586159,
"learning_rate": 2.183887248591996e-06,
"loss": 0.0799,
"step": 2280
},
{
"epoch": 4.04424778761062,
"grad_norm": 1.3926757930334368,
"learning_rate": 2.1454919107684615e-06,
"loss": 0.0824,
"step": 2285
},
{
"epoch": 4.053097345132743,
"grad_norm": 1.3315563224559441,
"learning_rate": 2.107396493527489e-06,
"loss": 0.0765,
"step": 2290
},
{
"epoch": 4.061946902654867,
"grad_norm": 1.36195332075,
"learning_rate": 2.069602451524114e-06,
"loss": 0.0752,
"step": 2295
},
{
"epoch": 4.070796460176991,
"grad_norm": 1.350156993277736,
"learning_rate": 2.0321112279055e-06,
"loss": 0.0791,
"step": 2300
},
{
"epoch": 4.079646017699115,
"grad_norm": 1.448441545074319,
"learning_rate": 1.9949242542558466e-06,
"loss": 0.0737,
"step": 2305
},
{
"epoch": 4.088495575221239,
"grad_norm": 1.3217457960916599,
"learning_rate": 1.9580429505417054e-06,
"loss": 0.083,
"step": 2310
},
{
"epoch": 4.097345132743363,
"grad_norm": 1.33752273930371,
"learning_rate": 1.9214687250577766e-06,
"loss": 0.0833,
"step": 2315
},
{
"epoch": 4.106194690265487,
"grad_norm": 1.3522253645938511,
"learning_rate": 1.8852029743731203e-06,
"loss": 0.0778,
"step": 2320
},
{
"epoch": 4.115044247787611,
"grad_norm": 1.3575033220346027,
"learning_rate": 1.8492470832778442e-06,
"loss": 0.0812,
"step": 2325
},
{
"epoch": 4.123893805309734,
"grad_norm": 1.2823684094843621,
"learning_rate": 1.8136024247302152e-06,
"loss": 0.0763,
"step": 2330
},
{
"epoch": 4.132743362831858,
"grad_norm": 1.3765176805961667,
"learning_rate": 1.7782703598042327e-06,
"loss": 0.0758,
"step": 2335
},
{
"epoch": 4.1415929203539825,
"grad_norm": 1.2766052485378403,
"learning_rate": 1.7432522376376637e-06,
"loss": 0.0824,
"step": 2340
},
{
"epoch": 4.150442477876107,
"grad_norm": 1.3056775652487742,
"learning_rate": 1.7085493953805187e-06,
"loss": 0.0788,
"step": 2345
},
{
"epoch": 4.15929203539823,
"grad_norm": 1.2765927958394436,
"learning_rate": 1.6741631581440066e-06,
"loss": 0.0792,
"step": 2350
},
{
"epoch": 4.168141592920354,
"grad_norm": 1.2224525666208792,
"learning_rate": 1.6400948389499194e-06,
"loss": 0.0767,
"step": 2355
},
{
"epoch": 4.176991150442478,
"grad_norm": 1.3375837067028162,
"learning_rate": 1.6063457386805004e-06,
"loss": 0.0734,
"step": 2360
},
{
"epoch": 4.185840707964601,
"grad_norm": 1.336866928045873,
"learning_rate": 1.572917146028783e-06,
"loss": 0.0812,
"step": 2365
},
{
"epoch": 4.1946902654867255,
"grad_norm": 1.3309435734104655,
"learning_rate": 1.539810337449369e-06,
"loss": 0.079,
"step": 2370
},
{
"epoch": 4.20353982300885,
"grad_norm": 1.32678570412564,
"learning_rate": 1.507026577109686e-06,
"loss": 0.0796,
"step": 2375
},
{
"epoch": 4.212389380530974,
"grad_norm": 1.3156430424303722,
"learning_rate": 1.4745671168417265e-06,
"loss": 0.0777,
"step": 2380
},
{
"epoch": 4.221238938053097,
"grad_norm": 1.2973545964944124,
"learning_rate": 1.442433196094236e-06,
"loss": 0.0827,
"step": 2385
},
{
"epoch": 4.230088495575221,
"grad_norm": 1.311408962321446,
"learning_rate": 1.4106260418854033e-06,
"loss": 0.0775,
"step": 2390
},
{
"epoch": 4.238938053097345,
"grad_norm": 1.3792578224833796,
"learning_rate": 1.379146868755985e-06,
"loss": 0.0804,
"step": 2395
},
{
"epoch": 4.247787610619469,
"grad_norm": 1.2720416544972974,
"learning_rate": 1.3479968787229402e-06,
"loss": 0.0811,
"step": 2400
},
{
"epoch": 4.256637168141593,
"grad_norm": 1.2865117421029972,
"learning_rate": 1.3171772612335332e-06,
"loss": 0.076,
"step": 2405
},
{
"epoch": 4.265486725663717,
"grad_norm": 1.3809883790356827,
"learning_rate": 1.2866891931199132e-06,
"loss": 0.0797,
"step": 2410
},
{
"epoch": 4.274336283185841,
"grad_norm": 1.339742153054439,
"learning_rate": 1.2565338385541792e-06,
"loss": 0.0773,
"step": 2415
},
{
"epoch": 4.283185840707965,
"grad_norm": 1.369494397621055,
"learning_rate": 1.2267123490039201e-06,
"loss": 0.0803,
"step": 2420
},
{
"epoch": 4.292035398230088,
"grad_norm": 1.2994483999857558,
"learning_rate": 1.1972258631882527e-06,
"loss": 0.076,
"step": 2425
},
{
"epoch": 4.300884955752212,
"grad_norm": 1.3824019299588173,
"learning_rate": 1.168075507034341e-06,
"loss": 0.0779,
"step": 2430
},
{
"epoch": 4.3097345132743365,
"grad_norm": 1.3842282775051018,
"learning_rate": 1.1392623936343994e-06,
"loss": 0.08,
"step": 2435
},
{
"epoch": 4.31858407079646,
"grad_norm": 1.359855304821783,
"learning_rate": 1.110787623203189e-06,
"loss": 0.08,
"step": 2440
},
{
"epoch": 4.327433628318584,
"grad_norm": 1.3510428064785265,
"learning_rate": 1.0826522830360087e-06,
"loss": 0.0814,
"step": 2445
},
{
"epoch": 4.336283185840708,
"grad_norm": 1.3365703898748194,
"learning_rate": 1.0548574474671835e-06,
"loss": 0.0791,
"step": 2450
},
{
"epoch": 4.345132743362832,
"grad_norm": 1.391541582481427,
"learning_rate": 1.027404177829031e-06,
"loss": 0.0827,
"step": 2455
},
{
"epoch": 4.353982300884955,
"grad_norm": 1.324819487757073,
"learning_rate": 1.0002935224113387e-06,
"loss": 0.0796,
"step": 2460
},
{
"epoch": 4.3628318584070795,
"grad_norm": 1.403599449784847,
"learning_rate": 9.735265164213349e-07,
"loss": 0.0806,
"step": 2465
},
{
"epoch": 4.371681415929204,
"grad_norm": 1.254012872145482,
"learning_rate": 9.471041819441673e-07,
"loss": 0.0762,
"step": 2470
},
{
"epoch": 4.380530973451328,
"grad_norm": 1.3727040572468259,
"learning_rate": 9.210275279038638e-07,
"loss": 0.0773,
"step": 2475
},
{
"epoch": 4.389380530973451,
"grad_norm": 1.269212054809803,
"learning_rate": 8.952975500248129e-07,
"loss": 0.0789,
"step": 2480
},
{
"epoch": 4.398230088495575,
"grad_norm": 1.2901290730207529,
"learning_rate": 8.69915230793742e-07,
"loss": 0.075,
"step": 2485
},
{
"epoch": 4.407079646017699,
"grad_norm": 1.2318338508628874,
"learning_rate": 8.448815394222043e-07,
"loss": 0.0813,
"step": 2490
},
{
"epoch": 4.415929203539823,
"grad_norm": 1.2925512164200812,
"learning_rate": 8.20197431809564e-07,
"loss": 0.0755,
"step": 2495
},
{
"epoch": 4.424778761061947,
"grad_norm": 1.3632740857909407,
"learning_rate": 7.958638505065031e-07,
"loss": 0.077,
"step": 2500
},
{
"epoch": 4.433628318584071,
"grad_norm": 1.2727197193668072,
"learning_rate": 7.718817246790222e-07,
"loss": 0.0756,
"step": 2505
},
{
"epoch": 4.442477876106195,
"grad_norm": 1.283160067311808,
"learning_rate": 7.48251970072964e-07,
"loss": 0.0771,
"step": 2510
},
{
"epoch": 4.451327433628318,
"grad_norm": 1.3266085812679953,
"learning_rate": 7.249754889790539e-07,
"loss": 0.0779,
"step": 2515
},
{
"epoch": 4.460176991150442,
"grad_norm": 1.3553958194021836,
"learning_rate": 7.020531701984334e-07,
"loss": 0.0815,
"step": 2520
},
{
"epoch": 4.469026548672566,
"grad_norm": 1.4498354656139365,
"learning_rate": 6.794858890087275e-07,
"loss": 0.0804,
"step": 2525
},
{
"epoch": 4.477876106194691,
"grad_norm": 1.33445017208115,
"learning_rate": 6.572745071306286e-07,
"loss": 0.0825,
"step": 2530
},
{
"epoch": 4.486725663716814,
"grad_norm": 1.2094739845924918,
"learning_rate": 6.3541987269498e-07,
"loss": 0.0724,
"step": 2535
},
{
"epoch": 4.495575221238938,
"grad_norm": 1.2915737532727858,
"learning_rate": 6.139228202104008e-07,
"loss": 0.0745,
"step": 2540
},
{
"epoch": 4.504424778761062,
"grad_norm": 1.3914894835662084,
"learning_rate": 5.927841705314175e-07,
"loss": 0.0796,
"step": 2545
},
{
"epoch": 4.513274336283186,
"grad_norm": 1.292394593993158,
"learning_rate": 5.720047308271149e-07,
"loss": 0.078,
"step": 2550
},
{
"epoch": 4.522123893805309,
"grad_norm": 1.3661686769110382,
"learning_rate": 5.515852945503241e-07,
"loss": 0.0811,
"step": 2555
},
{
"epoch": 4.530973451327434,
"grad_norm": 1.2989196151190672,
"learning_rate": 5.315266414073161e-07,
"loss": 0.077,
"step": 2560
},
{
"epoch": 4.539823008849558,
"grad_norm": 1.415979853747286,
"learning_rate": 5.118295373280335e-07,
"loss": 0.0812,
"step": 2565
},
{
"epoch": 4.548672566371682,
"grad_norm": 1.3239654771249467,
"learning_rate": 4.924947344368448e-07,
"loss": 0.079,
"step": 2570
},
{
"epoch": 4.557522123893805,
"grad_norm": 1.4440576212335767,
"learning_rate": 4.7352297102382317e-07,
"loss": 0.0747,
"step": 2575
},
{
"epoch": 4.566371681415929,
"grad_norm": 1.3092000545668983,
"learning_rate": 4.549149715165546e-07,
"loss": 0.0754,
"step": 2580
},
{
"epoch": 4.575221238938053,
"grad_norm": 1.2120991458734485,
"learning_rate": 4.3667144645247463e-07,
"loss": 0.0782,
"step": 2585
},
{
"epoch": 4.584070796460177,
"grad_norm": 1.2333368517726013,
"learning_rate": 4.187930924517436e-07,
"loss": 0.076,
"step": 2590
},
{
"epoch": 4.592920353982301,
"grad_norm": 1.2863679695017827,
"learning_rate": 4.012805921906393e-07,
"loss": 0.0751,
"step": 2595
},
{
"epoch": 4.601769911504425,
"grad_norm": 1.3354897817488147,
"learning_rate": 3.8413461437549203e-07,
"loss": 0.0774,
"step": 2600
},
{
"epoch": 4.610619469026549,
"grad_norm": 1.3165459456119635,
"learning_rate": 3.673558137171496e-07,
"loss": 0.0758,
"step": 2605
},
{
"epoch": 4.619469026548672,
"grad_norm": 1.448724213864104,
"learning_rate": 3.5094483090597706e-07,
"loss": 0.0772,
"step": 2610
},
{
"epoch": 4.628318584070796,
"grad_norm": 1.401376773315023,
"learning_rate": 3.3490229258739794e-07,
"loss": 0.0799,
"step": 2615
},
{
"epoch": 4.6371681415929205,
"grad_norm": 1.3907713234287105,
"learning_rate": 3.1922881133795827e-07,
"loss": 0.0784,
"step": 2620
},
{
"epoch": 4.646017699115045,
"grad_norm": 1.2597928306220052,
"learning_rate": 3.0392498564193685e-07,
"loss": 0.076,
"step": 2625
},
{
"epoch": 4.654867256637168,
"grad_norm": 1.3365628143767583,
"learning_rate": 2.889913998684979e-07,
"loss": 0.0772,
"step": 2630
},
{
"epoch": 4.663716814159292,
"grad_norm": 1.284544234891874,
"learning_rate": 2.744286242493721e-07,
"loss": 0.0782,
"step": 2635
},
{
"epoch": 4.672566371681416,
"grad_norm": 1.2551721985478679,
"learning_rate": 2.602372148570864e-07,
"loss": 0.0811,
"step": 2640
},
{
"epoch": 4.68141592920354,
"grad_norm": 1.2553558965086657,
"learning_rate": 2.4641771358372537e-07,
"loss": 0.0807,
"step": 2645
},
{
"epoch": 4.6902654867256635,
"grad_norm": 1.2348755466860069,
"learning_rate": 2.329706481202443e-07,
"loss": 0.0801,
"step": 2650
},
{
"epoch": 4.699115044247788,
"grad_norm": 1.1907724591359092,
"learning_rate": 2.1989653193631667e-07,
"loss": 0.0738,
"step": 2655
},
{
"epoch": 4.707964601769912,
"grad_norm": 1.3510071207779715,
"learning_rate": 2.0719586426072858e-07,
"loss": 0.084,
"step": 2660
},
{
"epoch": 4.716814159292035,
"grad_norm": 1.295370917721196,
"learning_rate": 1.9486913006231846e-07,
"loss": 0.0776,
"step": 2665
},
{
"epoch": 4.725663716814159,
"grad_norm": 1.3413694479416487,
"learning_rate": 1.8291680003145074e-07,
"loss": 0.0797,
"step": 2670
},
{
"epoch": 4.734513274336283,
"grad_norm": 1.4581808262991165,
"learning_rate": 1.7133933056205366e-07,
"loss": 0.0804,
"step": 2675
},
{
"epoch": 4.743362831858407,
"grad_norm": 1.2741635529778859,
"learning_rate": 1.601371637341864e-07,
"loss": 0.0752,
"step": 2680
},
{
"epoch": 4.752212389380531,
"grad_norm": 1.2782881439793918,
"learning_rate": 1.49310727297155e-07,
"loss": 0.0774,
"step": 2685
},
{
"epoch": 4.761061946902655,
"grad_norm": 1.3120371018641195,
"learning_rate": 1.3886043465318522e-07,
"loss": 0.0813,
"step": 2690
},
{
"epoch": 4.769911504424779,
"grad_norm": 1.1988340814149363,
"learning_rate": 1.2878668484163303e-07,
"loss": 0.0777,
"step": 2695
},
{
"epoch": 4.778761061946903,
"grad_norm": 1.321759520377253,
"learning_rate": 1.1908986252375243e-07,
"loss": 0.078,
"step": 2700
},
{
"epoch": 4.787610619469026,
"grad_norm": 1.4194270019959467,
"learning_rate": 1.097703379679993e-07,
"loss": 0.0775,
"step": 2705
},
{
"epoch": 4.79646017699115,
"grad_norm": 1.2788598484535139,
"learning_rate": 1.0082846703590055e-07,
"loss": 0.0765,
"step": 2710
},
{
"epoch": 4.8053097345132745,
"grad_norm": 1.3148204824038185,
"learning_rate": 9.226459116846054e-08,
"loss": 0.0751,
"step": 2715
},
{
"epoch": 4.814159292035399,
"grad_norm": 1.344867899591603,
"learning_rate": 8.407903737312929e-08,
"loss": 0.0808,
"step": 2720
},
{
"epoch": 4.823008849557522,
"grad_norm": 1.278318794310315,
"learning_rate": 7.627211821130576e-08,
"loss": 0.0756,
"step": 2725
},
{
"epoch": 4.831858407079646,
"grad_norm": 1.2101543880815988,
"learning_rate": 6.884413178641414e-08,
"loss": 0.0728,
"step": 2730
},
{
"epoch": 4.84070796460177,
"grad_norm": 1.4344243105198062,
"learning_rate": 6.179536173251399e-08,
"loss": 0.0844,
"step": 2735
},
{
"epoch": 4.849557522123893,
"grad_norm": 1.3276414122745406,
"learning_rate": 5.5126077203471186e-08,
"loss": 0.0808,
"step": 2740
},
{
"epoch": 4.8584070796460175,
"grad_norm": 1.3506069083893197,
"learning_rate": 4.883653286268164e-08,
"loss": 0.0833,
"step": 2745
},
{
"epoch": 4.867256637168142,
"grad_norm": 1.3682014190493492,
"learning_rate": 4.292696887334691e-08,
"loss": 0.0771,
"step": 2750
},
{
"epoch": 4.876106194690266,
"grad_norm": 1.2828761325152347,
"learning_rate": 3.7397610889300384e-08,
"loss": 0.0801,
"step": 2755
},
{
"epoch": 4.88495575221239,
"grad_norm": 1.3118235861689838,
"learning_rate": 3.224867004639642e-08,
"loss": 0.0775,
"step": 2760
},
{
"epoch": 4.893805309734513,
"grad_norm": 1.2599575243275205,
"learning_rate": 2.7480342954444572e-08,
"loss": 0.0771,
"step": 2765
},
{
"epoch": 4.902654867256637,
"grad_norm": 1.2689219424528377,
"learning_rate": 2.309281168970223e-08,
"loss": 0.0745,
"step": 2770
},
{
"epoch": 4.911504424778761,
"grad_norm": 1.3933156024565598,
"learning_rate": 1.9086243787922453e-08,
"loss": 0.0754,
"step": 2775
},
{
"epoch": 4.920353982300885,
"grad_norm": 1.321697338465418,
"learning_rate": 1.5460792237960154e-08,
"loss": 0.0757,
"step": 2780
},
{
"epoch": 4.929203539823009,
"grad_norm": 1.2224130906716606,
"learning_rate": 1.2216595475921245e-08,
"loss": 0.0804,
"step": 2785
},
{
"epoch": 4.938053097345133,
"grad_norm": 1.3113541450877073,
"learning_rate": 9.353777379889073e-09,
"loss": 0.0774,
"step": 2790
},
{
"epoch": 4.946902654867257,
"grad_norm": 1.3170038581923675,
"learning_rate": 6.8724472651815474e-09,
"loss": 0.0793,
"step": 2795
},
{
"epoch": 4.95575221238938,
"grad_norm": 1.2713773209438923,
"learning_rate": 4.772699880187804e-09,
"loss": 0.0763,
"step": 2800
},
{
"epoch": 4.964601769911504,
"grad_norm": 1.201702049473955,
"learning_rate": 3.054615402743322e-09,
"loss": 0.0733,
"step": 2805
},
{
"epoch": 4.9734513274336285,
"grad_norm": 1.272942179409837,
"learning_rate": 1.7182594370701577e-09,
"loss": 0.0772,
"step": 2810
},
{
"epoch": 4.982300884955752,
"grad_norm": 1.480916856542288,
"learning_rate": 7.636830112733862e-10,
"loss": 0.0793,
"step": 2815
},
{
"epoch": 4.991150442477876,
"grad_norm": 1.328167260480697,
"learning_rate": 1.9092257538932956e-10,
"loss": 0.0787,
"step": 2820
},
{
"epoch": 5.0,
"grad_norm": 1.2170466610329072,
"learning_rate": 0.0,
"loss": 0.0793,
"step": 2825
},
{
"epoch": 5.0,
"eval_loss": 0.1581079065799713,
"eval_runtime": 341.788,
"eval_samples_per_second": 22.005,
"eval_steps_per_second": 0.345,
"step": 2825
},
{
"epoch": 5.0,
"step": 2825,
"total_flos": 2365990109184000.0,
"train_loss": 0.3740393664457102,
"train_runtime": 65777.1761,
"train_samples_per_second": 5.497,
"train_steps_per_second": 0.043
}
],
"logging_steps": 5,
"max_steps": 2825,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"total_flos": 2365990109184000.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}