{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.99981234753237,
  "eval_steps": 500,
  "global_step": 999,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010008131606930632,
      "grad_norm": 0.09912109375,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.7364,
      "step": 1
    },
    {
      "epoch": 0.005004065803465315,
      "grad_norm": 0.09228515625,
      "learning_rate": 1e-05,
      "loss": 1.7255,
      "step": 5
    },
    {
      "epoch": 0.01000813160693063,
      "grad_norm": 0.10009765625,
      "learning_rate": 2e-05,
      "loss": 1.693,
      "step": 10
    },
    {
      "epoch": 0.015012197410395946,
      "grad_norm": 0.10546875,
      "learning_rate": 3e-05,
      "loss": 1.7131,
      "step": 15
    },
    {
      "epoch": 0.02001626321386126,
      "grad_norm": 0.1298828125,
      "learning_rate": 4e-05,
      "loss": 1.6944,
      "step": 20
    },
    {
      "epoch": 0.02502032901732658,
      "grad_norm": 0.162109375,
      "learning_rate": 5e-05,
      "loss": 1.6902,
      "step": 25
    },
    {
      "epoch": 0.030024394820791892,
      "grad_norm": 0.1982421875,
      "learning_rate": 6e-05,
      "loss": 1.6689,
      "step": 30
    },
    {
      "epoch": 0.035028460624257206,
      "grad_norm": 0.173828125,
      "learning_rate": 7e-05,
      "loss": 1.6255,
      "step": 35
    },
    {
      "epoch": 0.04003252642772252,
      "grad_norm": 0.1611328125,
      "learning_rate": 8e-05,
      "loss": 1.5785,
      "step": 40
    },
    {
      "epoch": 0.04503659223118784,
      "grad_norm": 0.1357421875,
      "learning_rate": 9e-05,
      "loss": 1.5011,
      "step": 45
    },
    {
      "epoch": 0.05004065803465316,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001,
      "loss": 1.4936,
      "step": 50
    },
    {
      "epoch": 0.055044723838118474,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.00011000000000000002,
      "loss": 1.48,
      "step": 55
    },
    {
      "epoch": 0.060048789641583784,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.00012,
      "loss": 1.4597,
      "step": 60
    },
    {
      "epoch": 0.0650528554450491,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00013000000000000002,
      "loss": 1.4573,
      "step": 65
    },
    {
      "epoch": 0.07005692124851441,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.00014,
      "loss": 1.4094,
      "step": 70
    },
    {
      "epoch": 0.07506098705197974,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.4073,
      "step": 75
    },
    {
      "epoch": 0.08006505285544505,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.00016,
      "loss": 1.4222,
      "step": 80
    },
    {
      "epoch": 0.08506911865891037,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.00017,
      "loss": 1.4125,
      "step": 85
    },
    {
      "epoch": 0.09007318446237568,
      "grad_norm": 0.021728515625,
      "learning_rate": 0.00018,
      "loss": 1.3956,
      "step": 90
    },
    {
      "epoch": 0.09507725026584099,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00019,
      "loss": 1.4091,
      "step": 95
    },
    {
      "epoch": 0.10008131606930631,
      "grad_norm": 0.0198974609375,
      "learning_rate": 0.0002,
      "loss": 1.368,
      "step": 100
    },
    {
      "epoch": 0.10508538187277162,
      "grad_norm": 0.0196533203125,
      "learning_rate": 0.00019998473561448797,
      "loss": 1.381,
      "step": 105
    },
    {
      "epoch": 0.11008944767623695,
      "grad_norm": 0.01806640625,
      "learning_rate": 0.0001999389471179811,
      "loss": 1.3798,
      "step": 110
    },
    {
      "epoch": 0.11509351347970226,
      "grad_norm": 0.017333984375,
      "learning_rate": 0.00019986264848914474,
      "loss": 1.3918,
      "step": 115
    },
    {
      "epoch": 0.12009757928316757,
      "grad_norm": 0.017578125,
      "learning_rate": 0.00019975586302101248,
      "loss": 1.3851,
      "step": 120
    },
    {
      "epoch": 0.12510164508663288,
      "grad_norm": 0.017578125,
      "learning_rate": 0.00019961862331387543,
      "loss": 1.3727,
      "step": 125
    },
    {
      "epoch": 0.1301057108900982,
      "grad_norm": 0.0169677734375,
      "learning_rate": 0.00019945097126532955,
      "loss": 1.3798,
      "step": 130
    },
    {
      "epoch": 0.13510977669356353,
      "grad_norm": 0.016357421875,
      "learning_rate": 0.0001992529580574848,
      "loss": 1.3553,
      "step": 135
    },
    {
      "epoch": 0.14011384249702882,
      "grad_norm": 0.01953125,
      "learning_rate": 0.00019902464414134005,
      "loss": 1.3801,
      "step": 140
    },
    {
      "epoch": 0.14511790830049415,
      "grad_norm": 0.0203857421875,
      "learning_rate": 0.000198766099218328,
      "loss": 1.354,
      "step": 145
    },
    {
      "epoch": 0.15012197410395947,
      "grad_norm": 0.01806640625,
      "learning_rate": 0.0001984774022190361,
      "loss": 1.3874,
      "step": 150
    },
    {
      "epoch": 0.1551260399074248,
      "grad_norm": 0.0166015625,
      "learning_rate": 0.0001981586412791103,
      "loss": 1.3565,
      "step": 155
    },
    {
      "epoch": 0.1601301057108901,
      "grad_norm": 0.0177001953125,
      "learning_rate": 0.00019780991371234801,
      "loss": 1.3525,
      "step": 160
    },
    {
      "epoch": 0.16513417151435542,
      "grad_norm": 0.01806640625,
      "learning_rate": 0.00019743132598098963,
      "loss": 1.3537,
      "step": 165
    },
    {
      "epoch": 0.17013823731782074,
      "grad_norm": 0.02734375,
      "learning_rate": 0.00019702299366321677,
      "loss": 1.3681,
      "step": 170
    },
    {
      "epoch": 0.17514230312128604,
      "grad_norm": 0.0201416015625,
      "learning_rate": 0.00019658504141786774,
      "loss": 1.3453,
      "step": 175
    },
    {
      "epoch": 0.18014636892475136,
      "grad_norm": 0.0184326171875,
      "learning_rate": 0.0001961176029463807,
      "loss": 1.3701,
      "step": 180
    },
    {
      "epoch": 0.18515043472821668,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.00019562082095197632,
      "loss": 1.3525,
      "step": 185
    },
    {
      "epoch": 0.19015450053168198,
      "grad_norm": 0.02001953125,
      "learning_rate": 0.00019509484709609215,
      "loss": 1.3427,
      "step": 190
    },
    {
      "epoch": 0.1951585663351473,
      "grad_norm": 0.02001953125,
      "learning_rate": 0.0001945398419520823,
      "loss": 1.3469,
      "step": 195
    },
    {
      "epoch": 0.20016263213861263,
      "grad_norm": 0.021484375,
      "learning_rate": 0.00019395597495619634,
      "loss": 1.3413,
      "step": 200
    },
    {
      "epoch": 0.20516669794207792,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.00019334342435585255,
      "loss": 1.3604,
      "step": 205
    },
    {
      "epoch": 0.21017076374554325,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00019270237715522112,
      "loss": 1.3695,
      "step": 210
    },
    {
      "epoch": 0.21517482954900857,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00019203302905813406,
      "loss": 1.3443,
      "step": 215
    },
    {
      "epoch": 0.2201788953524739,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00019133558440833926,
      "loss": 1.336,
      "step": 220
    },
    {
      "epoch": 0.2251829611559392,
      "grad_norm": 0.0234375,
      "learning_rate": 0.00019061025612711699,
      "loss": 1.373,
      "step": 225
    },
    {
      "epoch": 0.23018702695940452,
      "grad_norm": 0.021728515625,
      "learning_rate": 0.00018985726564827728,
      "loss": 1.3571,
      "step": 230
    },
    {
      "epoch": 0.23519109276286984,
      "grad_norm": 0.0218505859375,
      "learning_rate": 0.0001890768428505593,
      "loss": 1.3279,
      "step": 235
    },
    {
      "epoch": 0.24019515856633514,
      "grad_norm": 0.023193359375,
      "learning_rate": 0.00018826922598745197,
      "loss": 1.358,
      "step": 240
    },
    {
      "epoch": 0.24519922436980046,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.00018743466161445823,
      "loss": 1.3361,
      "step": 245
    },
    {
      "epoch": 0.25020329017326576,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00018657340451382447,
      "loss": 1.3324,
      "step": 250
    },
    {
      "epoch": 0.2552073559767311,
      "grad_norm": 0.0213623046875,
      "learning_rate": 0.00018568571761675893,
      "loss": 1.3304,
      "step": 255
    },
    {
      "epoch": 0.2602114217801964,
      "grad_norm": 0.0225830078125,
      "learning_rate": 0.00018477187192316184,
      "loss": 1.3585,
      "step": 260
    },
    {
      "epoch": 0.2652154875836617,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.00018383214641889243,
      "loss": 1.3295,
      "step": 265
    },
    {
      "epoch": 0.27021955338712705,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.00018286682799059824,
      "loss": 1.3565,
      "step": 270
    },
    {
      "epoch": 0.27522361919059235,
      "grad_norm": 0.0223388671875,
      "learning_rate": 0.00018187621133813187,
      "loss": 1.3314,
      "step": 275
    },
    {
      "epoch": 0.28022768499405765,
      "grad_norm": 0.0228271484375,
      "learning_rate": 0.00018086059888458288,
      "loss": 1.3395,
      "step": 280
    },
    {
      "epoch": 0.285231750797523,
      "grad_norm": 0.0218505859375,
      "learning_rate": 0.0001798203006839517,
      "loss": 1.3486,
      "step": 285
    },
    {
      "epoch": 0.2902358166009883,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.00017875563432649392,
      "loss": 1.3341,
      "step": 290
    },
    {
      "epoch": 0.29523988240445365,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.0001776669248417641,
      "loss": 1.3409,
      "step": 295
    },
    {
      "epoch": 0.30024394820791894,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.00017655450459938784,
      "loss": 1.3506,
      "step": 300
    },
    {
      "epoch": 0.30524801401138424,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.00017541871320759378,
      "loss": 1.3523,
      "step": 305
    },
    {
      "epoch": 0.3102520798148496,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.0001742598974095353,
      "loss": 1.3328,
      "step": 310
    },
    {
      "epoch": 0.3152561456183149,
      "grad_norm": 0.0220947265625,
      "learning_rate": 0.0001730784109774339,
      "loss": 1.3394,
      "step": 315
    },
    {
      "epoch": 0.3202602114217802,
      "grad_norm": 0.0224609375,
      "learning_rate": 0.00017187461460457717,
      "loss": 1.3379,
      "step": 320
    },
    {
      "epoch": 0.32526427722524553,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.00017064887579520334,
      "loss": 1.317,
      "step": 325
    },
    {
      "epoch": 0.33026834302871083,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00016940156875230687,
      "loss": 1.3257,
      "step": 330
    },
    {
      "epoch": 0.3352724088321761,
      "grad_norm": 0.0220947265625,
      "learning_rate": 0.00016813307426339892,
      "loss": 1.3265,
      "step": 335
    },
    {
      "epoch": 0.3402764746356415,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.0001668437795842574,
      "loss": 1.3372,
      "step": 340
    },
    {
      "epoch": 0.3452805404391068,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.00016553407832070277,
      "loss": 1.3559,
      "step": 345
    },
    {
      "epoch": 0.35028460624257207,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.00016420437030843484,
      "loss": 1.3396,
      "step": 350
    },
    {
      "epoch": 0.3552886720460374,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.00016285506149096794,
      "loss": 1.3437,
      "step": 355
    },
    {
      "epoch": 0.3602927378495027,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.00016148656379570144,
      "loss": 1.3409,
      "step": 360
    },
    {
      "epoch": 0.365296803652968,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0001600992950081632,
      "loss": 1.3445,
      "step": 365
    },
    {
      "epoch": 0.37030086945643337,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.0001586936786444648,
      "loss": 1.3533,
      "step": 370
    },
    {
      "epoch": 0.37530493525989866,
      "grad_norm": 0.0234375,
      "learning_rate": 0.0001572701438220074,
      "loss": 1.343,
      "step": 375
    },
    {
      "epoch": 0.38030900106336396,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0001558291251284774,
      "loss": 1.3617,
      "step": 380
    },
    {
      "epoch": 0.3853130668668293,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.00015437106248917217,
      "loss": 1.323,
      "step": 385
    },
    {
      "epoch": 0.3903171326702946,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.00015289640103269625,
      "loss": 1.3186,
      "step": 390
    },
    {
      "epoch": 0.3953211984737599,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.00015140559095506908,
      "loss": 1.3228,
      "step": 395
    },
    {
      "epoch": 0.40032526427722526,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.00014989908738228567,
      "loss": 1.3341,
      "step": 400
    },
    {
      "epoch": 0.40532933008069055,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.00014837735023137216,
      "loss": 1.3479,
      "step": 405
    },
    {
      "epoch": 0.41033339588415585,
      "grad_norm": 0.0267333984375,
      "learning_rate": 0.00014684084406997903,
      "loss": 1.3272,
      "step": 410
    },
    {
      "epoch": 0.4153374616876212,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.00014529003797455402,
      "loss": 1.338,
      "step": 415
    },
    {
      "epoch": 0.4203415274910865,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.000143725405387139,
      "loss": 1.3344,
      "step": 420
    },
    {
      "epoch": 0.42534559329455185,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.000142147423970834,
      "loss": 1.3262,
      "step": 425
    },
    {
      "epoch": 0.43034965909801715,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0001405565754639724,
      "loss": 1.3282,
      "step": 430
    },
    {
      "epoch": 0.43535372490148244,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.00013895334553305216,
      "loss": 1.3363,
      "step": 435
    },
    {
      "epoch": 0.4403577907049478,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0001373382236244679,
      "loss": 1.3365,
      "step": 440
    },
    {
      "epoch": 0.4453618565084131,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.0001357117028150889,
      "loss": 1.338,
      "step": 445
    },
    {
      "epoch": 0.4503659223118784,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.00013407427966172865,
      "loss": 1.3344,
      "step": 450
    },
    {
      "epoch": 0.45536998811534374,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.00013242645404955237,
      "loss": 1.3576,
      "step": 455
    },
    {
      "epoch": 0.46037405391880903,
      "grad_norm": 0.02734375,
      "learning_rate": 0.00013076872903946806,
      "loss": 1.3258,
      "step": 460
    },
    {
      "epoch": 0.46537811972227433,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0001291016107145483,
      "loss": 1.3441,
      "step": 465
    },
    {
      "epoch": 0.4703821855257397,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.00012742560802552912,
      "loss": 1.3255,
      "step": 470
    },
    {
      "epoch": 0.475386251329205,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.00012574123263543388,
      "loss": 1.3483,
      "step": 475
    },
    {
      "epoch": 0.4803903171326703,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.0001240489987633686,
      "loss": 1.3465,
      "step": 480
    },
    {
      "epoch": 0.4853943829361356,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.0001223494230275372,
      "loss": 1.3419,
      "step": 485
    },
    {
      "epoch": 0.4903984487396009,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.0001206430242875246,
      "loss": 1.3168,
      "step": 490
    },
    {
      "epoch": 0.4954025145430662,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.00011893032348589478,
      "loss": 1.3379,
      "step": 495
    },
    {
      "epoch": 0.5004065803465315,
      "grad_norm": 0.025390625,
      "learning_rate": 0.00011721184348915384,
      "loss": 1.3195,
      "step": 500
    },
    {
      "epoch": 0.5054106461499969,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.00011548810892812505,
      "loss": 1.3169,
      "step": 505
    },
    {
      "epoch": 0.5104147119534622,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.00011375964603778561,
      "loss": 1.3208,
      "step": 510
    },
    {
      "epoch": 0.5154187777569275,
      "grad_norm": 0.025390625,
      "learning_rate": 0.00011202698249661364,
      "loss": 1.3139,
      "step": 515
    },
    {
      "epoch": 0.5204228435603928,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.00011029064726549412,
      "loss": 1.3217,
      "step": 520
    },
    {
      "epoch": 0.5254269093638582,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.000108551170426234,
      "loss": 1.3115,
      "step": 525
    },
    {
      "epoch": 0.5304309751673234,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0001068090830197346,
      "loss": 1.3709,
      "step": 530
    },
    {
      "epoch": 0.5354350409707888,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.00010506491688387127,
      "loss": 1.3332,
      "step": 535
    },
    {
      "epoch": 0.5404391067742541,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.00010331920449112991,
      "loss": 1.3496,
      "step": 540
    },
    {
      "epoch": 0.5454431725777193,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.00010157247878604961,
      "loss": 1.3611,
      "step": 545
    },
    {
      "epoch": 0.5504472383811847,
      "grad_norm": 0.024658203125,
      "learning_rate": 9.982527302252135e-05,
      "loss": 1.3502,
      "step": 550
    },
    {
      "epoch": 0.55545130418465,
      "grad_norm": 0.0235595703125,
      "learning_rate": 9.807812060099191e-05,
      "loss": 1.3278,
      "step": 555
    },
    {
      "epoch": 0.5604553699881153,
      "grad_norm": 0.0244140625,
      "learning_rate": 9.633155490562358e-05,
      "loss": 1.3605,
      "step": 560
    },
    {
      "epoch": 0.5654594357915806,
      "grad_norm": 0.0235595703125,
      "learning_rate": 9.458610914145826e-05,
      "loss": 1.3379,
      "step": 565
    },
    {
      "epoch": 0.570463501595046,
      "grad_norm": 0.0257568359375,
      "learning_rate": 9.284231617163666e-05,
      "loss": 1.3085,
      "step": 570
    },
    {
      "epoch": 0.5754675673985112,
      "grad_norm": 0.0252685546875,
      "learning_rate": 9.11007083547216e-05,
      "loss": 1.3415,
      "step": 575
    },
    {
      "epoch": 0.5804716332019766,
      "grad_norm": 0.0252685546875,
      "learning_rate": 8.936181738217571e-05,
      "loss": 1.3251,
      "step": 580
    },
    {
      "epoch": 0.5854756990054419,
      "grad_norm": 0.0262451171875,
      "learning_rate": 8.762617411604235e-05,
      "loss": 1.3519,
      "step": 585
    },
    {
      "epoch": 0.5904797648089073,
      "grad_norm": 0.0252685546875,
      "learning_rate": 8.589430842688001e-05,
      "loss": 1.3148,
      "step": 590
    },
    {
      "epoch": 0.5954838306123725,
      "grad_norm": 0.02685546875,
      "learning_rate": 8.41667490319994e-05,
      "loss": 1.3169,
      "step": 595
    },
    {
      "epoch": 0.6004878964158379,
      "grad_norm": 0.024658203125,
      "learning_rate": 8.244402333405252e-05,
      "loss": 1.3561,
      "step": 600
    },
    {
      "epoch": 0.6054919622193032,
      "grad_norm": 0.025146484375,
      "learning_rate": 8.0726657260023e-05,
      "loss": 1.3059,
      "step": 605
    },
    {
      "epoch": 0.6104960280227685,
      "grad_norm": 0.0245361328125,
      "learning_rate": 7.901517510066724e-05,
      "loss": 1.329,
      "step": 610
    },
    {
      "epoch": 0.6155000938262338,
      "grad_norm": 0.0250244140625,
      "learning_rate": 7.73100993504548e-05,
      "loss": 1.3412,
      "step": 615
    },
    {
      "epoch": 0.6205041596296992,
      "grad_norm": 0.025146484375,
      "learning_rate": 7.561195054805729e-05,
      "loss": 1.3447,
      "step": 620
    },
    {
      "epoch": 0.6255082254331644,
      "grad_norm": 0.02392578125,
      "learning_rate": 7.392124711743422e-05,
      "loss": 1.3445,
      "step": 625
    },
    {
      "epoch": 0.6305122912366298,
      "grad_norm": 0.025634765625,
      "learning_rate": 7.223850520956457e-05,
      "loss": 1.3078,
      "step": 630
    },
    {
      "epoch": 0.6355163570400951,
      "grad_norm": 0.026123046875,
      "learning_rate": 7.056423854487236e-05,
      "loss": 1.3427,
      "step": 635
    },
    {
      "epoch": 0.6405204228435604,
      "grad_norm": 0.0244140625,
      "learning_rate": 6.889895825639401e-05,
      "loss": 1.3364,
      "step": 640
    },
    {
      "epoch": 0.6455244886470257,
      "grad_norm": 0.0263671875,
      "learning_rate": 6.724317273373563e-05,
      "loss": 1.3555,
      "step": 645
    },
    {
      "epoch": 0.6505285544504911,
      "grad_norm": 0.02587890625,
      "learning_rate": 6.55973874678682e-05,
      "loss": 1.3542,
      "step": 650
    },
    {
      "epoch": 0.6555326202539563,
      "grad_norm": 0.025390625,
      "learning_rate": 6.396210489680699e-05,
      "loss": 1.3421,
      "step": 655
    },
    {
      "epoch": 0.6605366860574217,
      "grad_norm": 0.0240478515625,
      "learning_rate": 6.23378242522237e-05,
      "loss": 1.3395,
      "step": 660
    },
    {
      "epoch": 0.665540751860887,
      "grad_norm": 0.02490234375,
      "learning_rate": 6.072504140703714e-05,
      "loss": 1.3291,
      "step": 665
    },
    {
      "epoch": 0.6705448176643523,
      "grad_norm": 0.0255126953125,
      "learning_rate": 5.912424872402927e-05,
      "loss": 1.33,
      "step": 670
    },
    {
      "epoch": 0.6755488834678176,
      "grad_norm": 0.0257568359375,
      "learning_rate": 5.7535934905532816e-05,
      "loss": 1.3547,
      "step": 675
    },
    {
      "epoch": 0.680552949271283,
      "grad_norm": 0.02490234375,
      "learning_rate": 5.596058484423656e-05,
      "loss": 1.344,
      "step": 680
    },
    {
      "epoch": 0.6855570150747482,
      "grad_norm": 0.025390625,
      "learning_rate": 5.43986794751536e-05,
      "loss": 1.3617,
      "step": 685
    },
    {
      "epoch": 0.6905610808782136,
      "grad_norm": 0.023193359375,
      "learning_rate": 5.285069562879758e-05,
      "loss": 1.3275,
      "step": 690
    },
    {
      "epoch": 0.6955651466816789,
      "grad_norm": 0.0245361328125,
      "learning_rate": 5.1317105885612524e-05,
      "loss": 1.3459,
      "step": 695
    },
    {
      "epoch": 0.7005692124851441,
      "grad_norm": 0.026123046875,
      "learning_rate": 4.9798378431699585e-05,
      "loss": 1.3345,
      "step": 700
    },
    {
      "epoch": 0.7055732782886095,
      "grad_norm": 0.0260009765625,
      "learning_rate": 4.829497691588557e-05,
      "loss": 1.3208,
      "step": 705
    },
    {
      "epoch": 0.7105773440920748,
      "grad_norm": 0.02587890625,
      "learning_rate": 4.680736030817687e-05,
      "loss": 1.3546,
      "step": 710
    },
    {
      "epoch": 0.7155814098955401,
      "grad_norm": 0.0240478515625,
      "learning_rate": 4.533598275964139e-05,
      "loss": 1.326,
      "step": 715
    },
    {
      "epoch": 0.7205854756990054,
      "grad_norm": 0.0250244140625,
      "learning_rate": 4.388129346376178e-05,
      "loss": 1.3337,
      "step": 720
    },
    {
      "epoch": 0.7255895415024708,
      "grad_norm": 0.025390625,
      "learning_rate": 4.2443736519302314e-05,
      "loss": 1.3264,
      "step": 725
    },
    {
      "epoch": 0.730593607305936,
      "grad_norm": 0.0250244140625,
      "learning_rate": 4.102375079473087e-05,
      "loss": 1.3214,
      "step": 730
    },
    {
      "epoch": 0.7355976731094014,
      "grad_norm": 0.0255126953125,
      "learning_rate": 3.9621769794237894e-05,
      "loss": 1.3318,
      "step": 735
    },
    {
      "epoch": 0.7406017389128667,
      "grad_norm": 0.0247802734375,
      "learning_rate": 3.823822152539286e-05,
      "loss": 1.3327,
      "step": 740
    },
    {
      "epoch": 0.745605804716332,
      "grad_norm": 0.025146484375,
      "learning_rate": 3.687352836847874e-05,
      "loss": 1.3486,
      "step": 745
    },
    {
      "epoch": 0.7506098705197973,
      "grad_norm": 0.0252685546875,
      "learning_rate": 3.552810694754463e-05,
      "loss": 1.329,
      "step": 750
    },
    {
      "epoch": 0.7556139363232627,
      "grad_norm": 0.025634765625,
      "learning_rate": 3.42023680032154e-05,
      "loss": 1.3553,
      "step": 755
    },
    {
      "epoch": 0.7606180021267279,
      "grad_norm": 0.0240478515625,
      "learning_rate": 3.289671626729772e-05,
      "loss": 1.3087,
      "step": 760
    },
    {
      "epoch": 0.7656220679301933,
      "grad_norm": 0.0244140625,
      "learning_rate": 3.161155033922045e-05,
      "loss": 1.3299,
      "step": 765
    },
    {
      "epoch": 0.7706261337336586,
      "grad_norm": 0.02392578125,
      "learning_rate": 3.0347262564347057e-05,
      "loss": 1.3156,
      "step": 770
    },
    {
      "epoch": 0.7756301995371239,
      "grad_norm": 0.02587890625,
      "learning_rate": 2.9104238914197445e-05,
      "loss": 1.3171,
      "step": 775
    },
    {
      "epoch": 0.7806342653405892,
      "grad_norm": 0.02392578125,
      "learning_rate": 2.7882858868615467e-05,
      "loss": 1.3415,
      "step": 780
    },
    {
      "epoch": 0.7856383311440546,
      "grad_norm": 0.023193359375,
      "learning_rate": 2.6683495299918648e-05,
      "loss": 1.3576,
      "step": 785
    },
    {
      "epoch": 0.7906423969475198,
      "grad_norm": 0.025146484375,
      "learning_rate": 2.550651435906456e-05,
      "loss": 1.309,
      "step": 790
    },
    {
      "epoch": 0.7956464627509852,
      "grad_norm": 0.0272216796875,
      "learning_rate": 2.435227536386967e-05,
      "loss": 1.3299,
      "step": 795
    },
    {
      "epoch": 0.8006505285544505,
      "grad_norm": 0.0240478515625,
      "learning_rate": 2.3221130689313907e-05,
      "loss": 1.3354,
      "step": 800
    },
    {
      "epoch": 0.8056545943579158,
      "grad_norm": 0.023681640625,
      "learning_rate": 2.211342565996487e-05,
      "loss": 1.3262,
      "step": 805
    },
    {
      "epoch": 0.8106586601613811,
      "grad_norm": 0.025390625,
      "learning_rate": 2.1029498444554618e-05,
      "loss": 1.3339,
      "step": 810
    },
    {
      "epoch": 0.8156627259648465,
      "grad_norm": 0.02490234375,
      "learning_rate": 1.9969679952740805e-05,
      "loss": 1.3516,
      "step": 815
    },
    {
      "epoch": 0.8206667917683117,
      "grad_norm": 0.0234375,
      "learning_rate": 1.893429373408411e-05,
      "loss": 1.3399,
      "step": 820
    },
    {
      "epoch": 0.825670857571777,
      "grad_norm": 0.0242919921875,
      "learning_rate": 1.7923655879272393e-05,
      "loss": 1.3149,
      "step": 825
    },
    {
      "epoch": 0.8306749233752424,
      "grad_norm": 0.0242919921875,
      "learning_rate": 1.6938074923622227e-05,
      "loss": 1.3292,
      "step": 830
    },
    {
      "epoch": 0.8356789891787078,
      "grad_norm": 0.0242919921875,
      "learning_rate": 1.597785175288683e-05,
      "loss": 1.3325,
      "step": 835
    },
    {
      "epoch": 0.840683054982173,
      "grad_norm": 0.0245361328125,
      "learning_rate": 1.5043279511399333e-05,
      "loss": 1.3544,
      "step": 840
    },
    {
      "epoch": 0.8456871207856383,
      "grad_norm": 0.0245361328125,
      "learning_rate": 1.4134643512579382e-05,
      "loss": 1.3601,
      "step": 845
    },
    {
      "epoch": 0.8506911865891037,
      "grad_norm": 0.0238037109375,
      "learning_rate": 1.3252221151830513e-05,
      "loss": 1.3447,
      "step": 850
    },
    {
      "epoch": 0.8556952523925689,
      "grad_norm": 0.0238037109375,
      "learning_rate": 1.2396281821854683e-05,
      "loss": 1.3296,
      "step": 855
    },
    {
      "epoch": 0.8606993181960343,
      "grad_norm": 0.024169921875,
      "learning_rate": 1.156708683041008e-05,
      "loss": 1.3187,
      "step": 860
    },
    {
      "epoch": 0.8657033839994996,
      "grad_norm": 0.024169921875,
      "learning_rate": 1.0764889320536931e-05,
      "loss": 1.3136,
      "step": 865
    },
    {
      "epoch": 0.8707074498029649,
      "grad_norm": 0.023193359375,
      "learning_rate": 9.989934193276219e-06,
      "loss": 1.323,
      "step": 870
    },
    {
      "epoch": 0.8757115156064302,
      "grad_norm": 0.024658203125,
      "learning_rate": 9.242458032904311e-06,
      "loss": 1.3221,
      "step": 875
    },
    {
      "epoch": 0.8807155814098956,
      "grad_norm": 0.02587890625,
      "learning_rate": 8.52268903470661e-06,
      "loss": 1.3413,
      "step": 880
    },
    {
      "epoch": 0.8857196472133608,
      "grad_norm": 0.0242919921875,
      "learning_rate": 7.830846935312509e-06,
      "loss": 1.3342,
      "step": 885
    },
    {
      "epoch": 0.8907237130168262,
      "grad_norm": 0.0245361328125,
      "learning_rate": 7.167142945612393e-06,
      "loss": 1.3265,
      "step": 890
    },
    {
      "epoch": 0.8957277788202915,
      "grad_norm": 0.0238037109375,
      "learning_rate": 6.531779686277528e-06,
      "loss": 1.3387,
      "step": 895
    },
    {
      "epoch": 0.9007318446237568,
      "grad_norm": 0.02392578125,
      "learning_rate": 5.924951125902545e-06,
      "loss": 1.3167,
      "step": 900
    },
    {
      "epoch": 0.9057359104272221,
      "grad_norm": 0.0235595703125,
      "learning_rate": 5.346842521789141e-06,
      "loss": 1.3465,
      "step": 905
    },
    {
      "epoch": 0.9107399762306875,
      "grad_norm": 0.0244140625,
      "learning_rate": 4.7976303633893384e-06,
      "loss": 1.3396,
      "step": 910
    },
    {
      "epoch": 0.9157440420341527,
      "grad_norm": 0.0250244140625,
      "learning_rate": 4.277482318425408e-06,
      "loss": 1.3569,
      "step": 915
    },
    {
      "epoch": 0.9207481078376181,
      "grad_norm": 0.025390625,
      "learning_rate": 3.7865571817029877e-06,
      "loss": 1.3265,
      "step": 920
    },
    {
      "epoch": 0.9257521736410834,
      "grad_norm": 0.025390625,
      "learning_rate": 3.3250048266329825e-06,
      "loss": 1.3392,
      "step": 925
    },
    {
      "epoch": 0.9307562394445487,
      "grad_norm": 0.024658203125,
      "learning_rate": 2.8929661594770174e-06,
      "loss": 1.3597,
      "step": 930
    },
    {
      "epoch": 0.935760305248014,
      "grad_norm": 0.0257568359375,
      "learning_rate": 2.4905730763305047e-06,
      "loss": 1.3329,
      "step": 935
    },
    {
      "epoch": 0.9407643710514794,
      "grad_norm": 0.025146484375,
      "learning_rate": 2.1179484228564305e-06,
      "loss": 1.3497,
      "step": 940
    },
    {
      "epoch": 0.9457684368549446,
      "grad_norm": 0.02392578125,
      "learning_rate": 1.7752059567820333e-06,
      "loss": 1.3367,
      "step": 945
    },
    {
      "epoch": 0.95077250265841,
      "grad_norm": 0.0245361328125,
      "learning_rate": 1.4624503131699828e-06,
      "loss": 1.3459,
      "step": 950
    },
    {
      "epoch": 0.9557765684618753,
      "grad_norm": 0.02490234375,
      "learning_rate": 1.1797769724745888e-06,
      "loss": 1.3326,
      "step": 955
    },
    {
      "epoch": 0.9607806342653405,
      "grad_norm": 0.024169921875,
      "learning_rate": 9.272722313927617e-07,
      "loss": 1.3487,
      "step": 960
    },
    {
      "epoch": 0.9657847000688059,
      "grad_norm": 0.0247802734375,
      "learning_rate": 7.05013176518754e-07,
      "loss": 1.3647,
      "step": 965
    },
    {
      "epoch": 0.9707887658722713,
      "grad_norm": 0.0233154296875,
      "learning_rate": 5.130676608104845e-07,
      "loss": 1.3482,
      "step": 970
    },
    {
      "epoch": 0.9757928316757365,
      "grad_norm": 0.0247802734375,
      "learning_rate": 3.5149428287495343e-07,
      "loss": 1.3423,
      "step": 975
    },
    {
      "epoch": 0.9807968974792018,
      "grad_norm": 0.02587890625,
      "learning_rate": 2.2034236907874094e-07,
      "loss": 1.3267,
      "step": 980
    },
    {
      "epoch": 0.9858009632826672,
      "grad_norm": 0.0238037109375,
      "learning_rate": 1.1965195848929745e-07,
      "loss": 1.3185,
      "step": 985
    },
    {
      "epoch": 0.9908050290861324,
      "grad_norm": 0.02392578125,
      "learning_rate": 4.945379065152134e-08,
      "loss": 1.3129,
      "step": 990
    },
    {
      "epoch": 0.9958090948895978,
      "grad_norm": 0.024658203125,
      "learning_rate": 9.769296203332446e-09,
      "loss": 1.3423,
      "step": 995
    },
    {
      "epoch": 0.99981234753237,
      "eval_loss": 1.343445897102356,
      "eval_runtime": 1960.9798,
      "eval_samples_per_second": 7.216,
      "eval_steps_per_second": 7.216,
      "step": 999
    },
    {
      "epoch": 0.99981234753237,
      "step": 999,
      "total_flos": 2.0711686751461048e+18,
      "train_loss": 0.052138939037456644,
      "train_runtime": 4093.6241,
      "train_samples_per_second": 31.243,
      "train_steps_per_second": 0.244
    }
  ],
  "logging_steps": 5,
  "max_steps": 999,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 2.0711686751461048e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}