{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1090,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 9.363638838369855,
      "learning_rate": 1.8348623853211012e-07,
      "loss": 1.1557,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 7.698313407130926,
      "learning_rate": 9.174311926605506e-07,
      "loss": 1.1258,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 4.550657581482179,
      "learning_rate": 1.8348623853211011e-06,
      "loss": 1.0811,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.1206535230799584,
      "learning_rate": 2.7522935779816517e-06,
      "loss": 1.0172,
      "step": 15
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.6667669157454204,
      "learning_rate": 3.6697247706422022e-06,
      "loss": 1.0098,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.915795372442334,
      "learning_rate": 4.587155963302753e-06,
      "loss": 0.9699,
      "step": 25
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.0202263157289146,
      "learning_rate": 5.504587155963303e-06,
      "loss": 0.987,
      "step": 30
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.941819185301855,
      "learning_rate": 6.422018348623854e-06,
      "loss": 0.973,
      "step": 35
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.3388431226160704,
      "learning_rate": 7.3394495412844045e-06,
      "loss": 0.9786,
      "step": 40
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.977546667798474,
      "learning_rate": 8.256880733944956e-06,
      "loss": 0.9682,
      "step": 45
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.1514922297341688,
      "learning_rate": 9.174311926605506e-06,
      "loss": 0.9655,
      "step": 50
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.000399665458361,
      "learning_rate": 1.0091743119266055e-05,
      "loss": 0.9764,
      "step": 55
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.8927332401265788,
      "learning_rate": 1.1009174311926607e-05,
      "loss": 0.9827,
      "step": 60
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.518457316528633,
      "learning_rate": 1.1926605504587156e-05,
      "loss": 0.9916,
      "step": 65
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.8795240926276715,
      "learning_rate": 1.2844036697247708e-05,
      "loss": 0.978,
      "step": 70
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.4935356013821566,
      "learning_rate": 1.3761467889908258e-05,
      "loss": 0.9766,
      "step": 75
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.466277593115983,
      "learning_rate": 1.4678899082568809e-05,
      "loss": 0.9745,
      "step": 80
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.8668548628472559,
      "learning_rate": 1.559633027522936e-05,
      "loss": 0.9923,
      "step": 85
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.97992788018359,
      "learning_rate": 1.6513761467889912e-05,
      "loss": 0.9952,
      "step": 90
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.9211900138296392,
      "learning_rate": 1.743119266055046e-05,
      "loss": 1.0061,
      "step": 95
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.0439374947928624,
      "learning_rate": 1.834862385321101e-05,
      "loss": 0.9985,
      "step": 100
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.484664541196989,
      "learning_rate": 1.9266055045871563e-05,
      "loss": 0.991,
      "step": 105
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.207570653291085,
      "learning_rate": 1.999994872196626e-05,
      "loss": 1.002,
      "step": 110
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.3955711863042564,
      "learning_rate": 1.9998154046002822e-05,
      "loss": 0.9923,
      "step": 115
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.805464967586267,
      "learning_rate": 1.999379599421534e-05,
      "loss": 1.0008,
      "step": 120
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.0096429770798974,
      "learning_rate": 1.9986875683942535e-05,
      "loss": 1.0239,
      "step": 125
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.0670945338079534,
      "learning_rate": 1.9977394889447526e-05,
      "loss": 1.0025,
      "step": 130
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.9796306144816491,
      "learning_rate": 1.9965356041462954e-05,
      "loss": 1.0148,
      "step": 135
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7112414903007187,
      "learning_rate": 1.9950762226567783e-05,
      "loss": 1.0019,
      "step": 140
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.997309626939832,
      "learning_rate": 1.9933617186395917e-05,
      "loss": 0.9996,
      "step": 145
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.0837540997070243,
      "learning_rate": 1.9913925316676946e-05,
      "loss": 1.0029,
      "step": 150
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.7532309300779665,
      "learning_rate": 1.9891691666109112e-05,
      "loss": 0.9814,
      "step": 155
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.0488547177100593,
      "learning_rate": 1.9866921935064907e-05,
      "loss": 1.0033,
      "step": 160
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.9186897481938363,
      "learning_rate": 1.9839622474129595e-05,
      "loss": 0.9948,
      "step": 165
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.8790404004659207,
      "learning_rate": 1.9809800282473014e-05,
      "loss": 1.0133,
      "step": 170
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.7611301480055446,
      "learning_rate": 1.977746300605507e-05,
      "loss": 1.0155,
      "step": 175
    },
    {
      "epoch": 0.17,
      "grad_norm": 9.836819737668185,
      "learning_rate": 1.9742618935665478e-05,
      "loss": 1.0041,
      "step": 180
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.789300774084455,
      "learning_rate": 1.9705277004798072e-05,
      "loss": 1.0166,
      "step": 185
    },
    {
      "epoch": 0.17,
      "grad_norm": 242.96131281501275,
      "learning_rate": 1.9665446787360444e-05,
      "loss": 4.2595,
      "step": 190
    },
    {
      "epoch": 0.18,
      "grad_norm": 109.15370345648675,
      "learning_rate": 1.9623138495219292e-05,
      "loss": 8.5047,
      "step": 195
    },
    {
      "epoch": 0.18,
      "grad_norm": 14.326179300350288,
      "learning_rate": 1.957836297558229e-05,
      "loss": 7.5663,
      "step": 200
    },
    {
      "epoch": 0.19,
      "grad_norm": 13.265654155079291,
      "learning_rate": 1.9531131708217005e-05,
      "loss": 7.3637,
      "step": 205
    },
    {
      "epoch": 0.19,
      "grad_norm": 13.674475954916327,
      "learning_rate": 1.948145680250766e-05,
      "loss": 7.273,
      "step": 210
    },
    {
      "epoch": 0.2,
      "grad_norm": 8.12456969902826,
      "learning_rate": 1.9429350994350483e-05,
      "loss": 7.2159,
      "step": 215
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.6086468669712164,
      "learning_rate": 1.93748276428884e-05,
      "loss": 7.2198,
      "step": 220
    },
    {
      "epoch": 0.21,
      "grad_norm": 6.948446933409363,
      "learning_rate": 1.931790072708596e-05,
      "loss": 7.1902,
      "step": 225
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.5376231155027646,
      "learning_rate": 1.9258584842145342e-05,
      "loss": 7.1989,
      "step": 230
    },
    {
      "epoch": 0.22,
      "grad_norm": 4.090167600205962,
      "learning_rate": 1.9196895195764363e-05,
      "loss": 7.1904,
      "step": 235
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.2600853909498593,
      "learning_rate": 1.913284760423745e-05,
      "loss": 7.1933,
      "step": 240
    },
    {
      "epoch": 0.22,
      "grad_norm": 4.819120975382211,
      "learning_rate": 1.9066458488400586e-05,
      "loss": 7.1664,
      "step": 245
    },
    {
      "epoch": 0.23,
      "grad_norm": 3.7632935964121224,
      "learning_rate": 1.8997744869421248e-05,
      "loss": 7.157,
      "step": 250
    },
    {
      "epoch": 0.23,
      "grad_norm": 4.633751035095585,
      "learning_rate": 1.8926724364434447e-05,
      "loss": 7.1275,
      "step": 255
    },
    {
      "epoch": 0.24,
      "grad_norm": 5.1750257818971965,
      "learning_rate": 1.8853415182025953e-05,
      "loss": 7.0982,
      "step": 260
    },
    {
      "epoch": 0.24,
      "grad_norm": 8.967324318899966,
      "learning_rate": 1.8777836117563894e-05,
      "loss": 7.0618,
      "step": 265
    },
    {
      "epoch": 0.25,
      "grad_norm": 19.315932042895618,
      "learning_rate": 1.8700006548379898e-05,
      "loss": 6.9764,
      "step": 270
    },
    {
      "epoch": 0.25,
      "grad_norm": 7.0881940447215666,
      "learning_rate": 1.861994642880105e-05,
      "loss": 6.9355,
      "step": 275
    },
    {
      "epoch": 0.26,
      "grad_norm": 22.552340976092317,
      "learning_rate": 1.8537676285033886e-05,
      "loss": 6.8924,
      "step": 280
    },
    {
      "epoch": 0.26,
      "grad_norm": 35.955085580211176,
      "learning_rate": 1.845321720990181e-05,
      "loss": 6.8178,
      "step": 285
    },
    {
      "epoch": 0.27,
      "grad_norm": 44.03988221662733,
      "learning_rate": 1.8366590857437182e-05,
      "loss": 6.7881,
      "step": 290
    },
    {
      "epoch": 0.27,
      "grad_norm": 33.19644429475903,
      "learning_rate": 1.8277819437329577e-05,
      "loss": 6.7247,
      "step": 295
    },
    {
      "epoch": 0.28,
      "grad_norm": 73.6301202469884,
      "learning_rate": 1.8186925709231534e-05,
      "loss": 6.7,
      "step": 300
    },
    {
      "epoch": 0.28,
      "grad_norm": 50.40805995123905,
      "learning_rate": 1.809393297692334e-05,
      "loss": 6.6299,
      "step": 305
    },
    {
      "epoch": 0.28,
      "grad_norm": 32.94909850757248,
      "learning_rate": 1.799886508233829e-05,
      "loss": 6.574,
      "step": 310
    },
    {
      "epoch": 0.29,
      "grad_norm": 58.62369850675989,
      "learning_rate": 1.790174639944997e-05,
      "loss": 6.5143,
      "step": 315
    },
    {
      "epoch": 0.29,
      "grad_norm": 58.028764013036465,
      "learning_rate": 1.780260182802314e-05,
      "loss": 6.4641,
      "step": 320
    },
    {
      "epoch": 0.3,
      "grad_norm": 177.84430268746536,
      "learning_rate": 1.7701456787229805e-05,
      "loss": 6.4659,
      "step": 325
    },
    {
      "epoch": 0.3,
      "grad_norm": 55.12447286980276,
      "learning_rate": 1.7598337209132142e-05,
      "loss": 6.4537,
      "step": 330
    },
    {
      "epoch": 0.31,
      "grad_norm": 14.624403149302893,
      "learning_rate": 1.7493269532033882e-05,
      "loss": 6.4034,
      "step": 335
    },
    {
      "epoch": 0.31,
      "grad_norm": 43.70727416145112,
      "learning_rate": 1.738628069370195e-05,
      "loss": 6.3425,
      "step": 340
    },
    {
      "epoch": 0.32,
      "grad_norm": 51.61363762445518,
      "learning_rate": 1.7277398124460022e-05,
      "loss": 6.3135,
      "step": 345
    },
    {
      "epoch": 0.32,
      "grad_norm": 15.717786816672422,
      "learning_rate": 1.71666497401558e-05,
      "loss": 6.2447,
      "step": 350
    },
    {
      "epoch": 0.33,
      "grad_norm": 48.16105556797076,
      "learning_rate": 1.7054063935003813e-05,
      "loss": 6.2283,
      "step": 355
    },
    {
      "epoch": 0.33,
      "grad_norm": 45.33885455105297,
      "learning_rate": 1.6939669574305565e-05,
      "loss": 6.1934,
      "step": 360
    },
    {
      "epoch": 0.33,
      "grad_norm": 205.12275018145363,
      "learning_rate": 1.6823495987048922e-05,
      "loss": 6.1845,
      "step": 365
    },
    {
      "epoch": 0.34,
      "grad_norm": 140.34062856192622,
      "learning_rate": 1.6705572958388576e-05,
      "loss": 6.1908,
      "step": 370
    },
    {
      "epoch": 0.34,
      "grad_norm": 22.719850989180927,
      "learning_rate": 1.6585930722009602e-05,
      "loss": 6.1466,
      "step": 375
    },
    {
      "epoch": 0.35,
      "grad_norm": 25.020898156050958,
      "learning_rate": 1.6464599952375998e-05,
      "loss": 6.1118,
      "step": 380
    },
    {
      "epoch": 0.35,
      "grad_norm": 29.78095814605391,
      "learning_rate": 1.63416117568662e-05,
      "loss": 6.0879,
      "step": 385
    },
    {
      "epoch": 0.36,
      "grad_norm": 27.691981481859912,
      "learning_rate": 1.621699766779763e-05,
      "loss": 6.035,
      "step": 390
    },
    {
      "epoch": 0.36,
      "grad_norm": 51.38905417037612,
      "learning_rate": 1.6090789634342278e-05,
      "loss": 6.0116,
      "step": 395
    },
    {
      "epoch": 0.37,
      "grad_norm": 56.33481087428632,
      "learning_rate": 1.5963020014335437e-05,
      "loss": 6.0081,
      "step": 400
    },
    {
      "epoch": 0.37,
      "grad_norm": 45.76341033354771,
      "learning_rate": 1.583372156597961e-05,
      "loss": 5.9674,
      "step": 405
    },
    {
      "epoch": 0.38,
      "grad_norm": 76.2485523686512,
      "learning_rate": 1.570292743944583e-05,
      "loss": 5.9338,
      "step": 410
    },
    {
      "epoch": 0.38,
      "grad_norm": 159.1246861661541,
      "learning_rate": 1.557067116837444e-05,
      "loss": 5.9722,
      "step": 415
    },
    {
      "epoch": 0.39,
      "grad_norm": 132.20908717790851,
      "learning_rate": 1.5436986661277578e-05,
      "loss": 5.9314,
      "step": 420
    },
    {
      "epoch": 0.39,
      "grad_norm": 71.6597882899901,
      "learning_rate": 1.530190819284555e-05,
      "loss": 5.9226,
      "step": 425
    },
    {
      "epoch": 0.39,
      "grad_norm": 24.57391536298703,
      "learning_rate": 1.5165470395159314e-05,
      "loss": 5.8777,
      "step": 430
    },
    {
      "epoch": 0.4,
      "grad_norm": 30.810204350884774,
      "learning_rate": 1.5027708248811331e-05,
      "loss": 5.8426,
      "step": 435
    },
    {
      "epoch": 0.4,
      "grad_norm": 22.007616074267528,
      "learning_rate": 1.4888657073937077e-05,
      "loss": 5.8248,
      "step": 440
    },
    {
      "epoch": 0.41,
      "grad_norm": 45.21451125526737,
      "learning_rate": 1.4748352521159492e-05,
      "loss": 5.8128,
      "step": 445
    },
    {
      "epoch": 0.41,
      "grad_norm": 66.1164622252545,
      "learning_rate": 1.4606830562448692e-05,
      "loss": 5.8,
      "step": 450
    },
    {
      "epoch": 0.42,
      "grad_norm": 66.43271019490362,
      "learning_rate": 1.4464127481899312e-05,
      "loss": 5.7388,
      "step": 455
    },
    {
      "epoch": 0.42,
      "grad_norm": 66.81215308290967,
      "learning_rate": 1.4320279866427798e-05,
      "loss": 5.8021,
      "step": 460
    },
    {
      "epoch": 0.43,
      "grad_norm": 61.76469415155527,
      "learning_rate": 1.4175324596392075e-05,
      "loss": 5.7631,
      "step": 465
    },
    {
      "epoch": 0.43,
      "grad_norm": 65.59093203298633,
      "learning_rate": 1.402929883613599e-05,
      "loss": 5.736,
      "step": 470
    },
    {
      "epoch": 0.44,
      "grad_norm": 118.51044142323016,
      "learning_rate": 1.3882240024460928e-05,
      "loss": 5.6828,
      "step": 475
    },
    {
      "epoch": 0.44,
      "grad_norm": 105.84910737707047,
      "learning_rate": 1.3734185865027061e-05,
      "loss": 5.6849,
      "step": 480
    },
    {
      "epoch": 0.44,
      "grad_norm": 32.462567969669294,
      "learning_rate": 1.358517431668672e-05,
      "loss": 5.6538,
      "step": 485
    },
    {
      "epoch": 0.45,
      "grad_norm": 106.47382640656464,
      "learning_rate": 1.3435243583752294e-05,
      "loss": 5.6048,
      "step": 490
    },
    {
      "epoch": 0.45,
      "grad_norm": 145.1629065173643,
      "learning_rate": 1.3284432106201233e-05,
      "loss": 5.7145,
      "step": 495
    },
    {
      "epoch": 0.46,
      "grad_norm": 203.25513004760577,
      "learning_rate": 1.313277854982062e-05,
      "loss": 5.7344,
      "step": 500
    },
    {
      "epoch": 0.46,
      "grad_norm": 116.05723149181094,
      "learning_rate": 1.2980321796293838e-05,
      "loss": 5.6577,
      "step": 505
    },
    {
      "epoch": 0.47,
      "grad_norm": 21.139566960386738,
      "learning_rate": 1.2827100933231904e-05,
      "loss": 5.6114,
      "step": 510
    },
    {
      "epoch": 0.47,
      "grad_norm": 26.813357095510124,
      "learning_rate": 1.2673155244151985e-05,
      "loss": 5.5589,
      "step": 515
    },
    {
      "epoch": 0.48,
      "grad_norm": 43.82656244521634,
      "learning_rate": 1.2518524198405699e-05,
      "loss": 5.5018,
      "step": 520
    },
    {
      "epoch": 0.48,
      "grad_norm": 65.3879946725387,
      "learning_rate": 1.2363247441059775e-05,
      "loss": 5.4885,
      "step": 525
    },
    {
      "epoch": 0.49,
      "grad_norm": 52.86165228521218,
      "learning_rate": 1.2207364782731657e-05,
      "loss": 5.4701,
      "step": 530
    },
    {
      "epoch": 0.49,
      "grad_norm": 59.44473618774632,
      "learning_rate": 1.2050916189382646e-05,
      "loss": 5.4302,
      "step": 535
    },
    {
      "epoch": 0.5,
      "grad_norm": 26.460703860460455,
      "learning_rate": 1.189394177207125e-05,
      "loss": 5.3781,
      "step": 540
    },
    {
      "epoch": 0.5,
      "grad_norm": 58.18972877786887,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 5.3572,
      "step": 545
    },
    {
      "epoch": 0.5,
      "grad_norm": 83.35921782852613,
      "learning_rate": 1.1578576573543541e-05,
      "loss": 5.4269,
      "step": 550
    },
    {
      "epoch": 0.51,
      "grad_norm": 34.129434276568325,
      "learning_rate": 1.1420266647205232e-05,
      "loss": 5.364,
      "step": 555
    },
    {
      "epoch": 0.51,
      "grad_norm": 56.68082962283742,
      "learning_rate": 1.1261592585930576e-05,
      "loss": 5.2992,
      "step": 560
    },
    {
      "epoch": 0.52,
      "grad_norm": 21.643023682782022,
      "learning_rate": 1.1102595071354471e-05,
      "loss": 5.2592,
      "step": 565
    },
    {
      "epoch": 0.52,
      "grad_norm": 36.88815009730958,
      "learning_rate": 1.0943314868040365e-05,
      "loss": 5.2248,
      "step": 570
    },
    {
      "epoch": 0.53,
      "grad_norm": 35.31299501129291,
      "learning_rate": 1.0783792813028828e-05,
      "loss": 5.1845,
      "step": 575
    },
    {
      "epoch": 0.53,
      "grad_norm": 61.64543240010987,
      "learning_rate": 1.0624069805367558e-05,
      "loss": 5.2092,
      "step": 580
    },
    {
      "epoch": 0.54,
      "grad_norm": 32.919556730268106,
      "learning_rate": 1.0464186795625481e-05,
      "loss": 5.1762,
      "step": 585
    },
    {
      "epoch": 0.54,
      "grad_norm": 26.906436582456134,
      "learning_rate": 1.0304184775393642e-05,
      "loss": 5.1519,
      "step": 590
    },
    {
      "epoch": 0.55,
      "grad_norm": 48.66053797078755,
      "learning_rate": 1.0144104766775574e-05,
      "loss": 5.1177,
      "step": 595
    },
    {
      "epoch": 0.55,
      "grad_norm": 56.3116146779967,
      "learning_rate": 9.983987811869863e-06,
      "loss": 5.102,
      "step": 600
    },
    {
      "epoch": 0.56,
      "grad_norm": 87.9740004930047,
      "learning_rate": 9.823874962247565e-06,
      "loss": 5.1095,
      "step": 605
    },
    {
      "epoch": 0.56,
      "grad_norm": 66.31383230719314,
      "learning_rate": 9.663807268427197e-06,
      "loss": 5.1331,
      "step": 610
    },
    {
      "epoch": 0.56,
      "grad_norm": 58.69537097341761,
      "learning_rate": 9.503825769350016e-06,
      "loss": 5.0607,
      "step": 615
    },
    {
      "epoch": 0.57,
      "grad_norm": 50.77744089682505,
      "learning_rate": 9.343971481858246e-06,
      "loss": 5.0193,
      "step": 620
    },
    {
      "epoch": 0.57,
      "grad_norm": 40.26196176853548,
      "learning_rate": 9.184285390178978e-06,
      "loss": 4.9987,
      "step": 625
    },
    {
      "epoch": 0.58,
      "grad_norm": 37.04233592367474,
      "learning_rate": 9.024808435416435e-06,
      "loss": 4.99,
      "step": 630
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.99570481216453,
      "learning_rate": 8.865581505055292e-06,
      "loss": 4.9782,
      "step": 635
    },
    {
      "epoch": 0.59,
      "grad_norm": 25.433787904984356,
      "learning_rate": 8.706645422477739e-06,
      "loss": 4.9216,
      "step": 640
    },
    {
      "epoch": 0.59,
      "grad_norm": 30.120360546905527,
      "learning_rate": 8.548040936496989e-06,
      "loss": 4.9355,
      "step": 645
    },
    {
      "epoch": 0.6,
      "grad_norm": 17.059498258803476,
      "learning_rate": 8.389808710909881e-06,
      "loss": 4.9029,
      "step": 650
    },
    {
      "epoch": 0.6,
      "grad_norm": 43.200600750978815,
      "learning_rate": 8.231989314071318e-06,
      "loss": 4.9199,
      "step": 655
    },
    {
      "epoch": 0.61,
      "grad_norm": 36.280085935407776,
      "learning_rate": 8.07462320849313e-06,
      "loss": 4.8689,
      "step": 660
    },
    {
      "epoch": 0.61,
      "grad_norm": 43.88644104874169,
      "learning_rate": 7.917750740470116e-06,
      "loss": 4.8891,
      "step": 665
    },
    {
      "epoch": 0.61,
      "grad_norm": 31.741053371313615,
      "learning_rate": 7.761412129735853e-06,
      "loss": 4.8543,
      "step": 670
    },
    {
      "epoch": 0.62,
      "grad_norm": 67.90017223502674,
      "learning_rate": 7.605647459150961e-06,
      "loss": 4.8095,
      "step": 675
    },
    {
      "epoch": 0.62,
      "grad_norm": 35.90719699243417,
      "learning_rate": 7.4504966644264775e-06,
      "loss": 4.8115,
      "step": 680
    },
    {
      "epoch": 0.63,
      "grad_norm": 39.02489729571871,
      "learning_rate": 7.295999523884921e-06,
      "loss": 4.8148,
      "step": 685
    },
    {
      "epoch": 0.63,
      "grad_norm": 46.142215188761334,
      "learning_rate": 7.142195648261747e-06,
      "loss": 4.8305,
      "step": 690
    },
    {
      "epoch": 0.64,
      "grad_norm": 25.914087667720942,
      "learning_rate": 6.989124470549746e-06,
      "loss": 4.7909,
      "step": 695
    },
    {
      "epoch": 0.64,
      "grad_norm": 35.254452574603874,
      "learning_rate": 6.83682523588902e-06,
      "loss": 4.7567,
      "step": 700
    },
    {
      "epoch": 0.65,
      "grad_norm": 51.24311260833751,
      "learning_rate": 6.685336991505122e-06,
      "loss": 4.7621,
      "step": 705
    },
    {
      "epoch": 0.65,
      "grad_norm": 62.972857231438994,
      "learning_rate": 6.5346985766979384e-06,
      "loss": 4.7508,
      "step": 710
    },
    {
      "epoch": 0.66,
      "grad_norm": 26.803155155590026,
      "learning_rate": 6.384948612883872e-06,
      "loss": 4.7262,
      "step": 715
    },
    {
      "epoch": 0.66,
      "grad_norm": 62.01432381157656,
      "learning_rate": 6.2361254936939e-06,
      "loss": 4.7213,
      "step": 720
    },
    {
      "epoch": 0.67,
      "grad_norm": 52.16714069263354,
      "learning_rate": 6.0882673751300235e-06,
      "loss": 4.7201,
      "step": 725
    },
    {
      "epoch": 0.67,
      "grad_norm": 47.197837362961145,
      "learning_rate": 5.941412165782645e-06,
      "loss": 4.6971,
      "step": 730
    },
    {
      "epoch": 0.67,
      "grad_norm": 52.802764515881336,
      "learning_rate": 5.79559751711138e-06,
      "loss": 4.6844,
      "step": 735
    },
    {
      "epoch": 0.68,
      "grad_norm": 40.805066320948455,
      "learning_rate": 5.650860813791786e-06,
      "loss": 4.6628,
      "step": 740
    },
    {
      "epoch": 0.68,
      "grad_norm": 46.63291056024256,
      "learning_rate": 5.507239164130501e-06,
      "loss": 4.6673,
      "step": 745
    },
    {
      "epoch": 0.69,
      "grad_norm": 25.617868108401684,
      "learning_rate": 5.364769390551225e-06,
      "loss": 4.6606,
      "step": 750
    },
    {
      "epoch": 0.69,
      "grad_norm": 30.046819543783922,
      "learning_rate": 5.223488020154028e-06,
      "loss": 4.6283,
      "step": 755
    },
    {
      "epoch": 0.7,
      "grad_norm": 29.92322960237207,
      "learning_rate": 5.083431275350312e-06,
      "loss": 4.6158,
      "step": 760
    },
    {
      "epoch": 0.7,
      "grad_norm": 23.14008510014771,
      "learning_rate": 4.9446350645759885e-06,
      "loss": 4.5927,
      "step": 765
    },
    {
      "epoch": 0.71,
      "grad_norm": 28.386364075607865,
      "learning_rate": 4.807134973085036e-06,
      "loss": 4.6146,
      "step": 770
    },
    {
      "epoch": 0.71,
      "grad_norm": 58.55131000208054,
      "learning_rate": 4.670966253826027e-06,
      "loss": 4.6026,
      "step": 775
    },
    {
      "epoch": 0.72,
      "grad_norm": 73.54411177838821,
      "learning_rate": 4.53616381840377e-06,
      "loss": 4.5888,
      "step": 780
    },
    {
      "epoch": 0.72,
      "grad_norm": 40.62402827270361,
      "learning_rate": 4.402762228128531e-06,
      "loss": 4.6271,
      "step": 785
    },
    {
      "epoch": 0.72,
      "grad_norm": 51.502712788137366,
      "learning_rate": 4.270795685155001e-06,
      "loss": 4.569,
      "step": 790
    },
    {
      "epoch": 0.73,
      "grad_norm": 27.330225371915265,
      "learning_rate": 4.140298023713416e-06,
      "loss": 4.5708,
      "step": 795
    },
    {
      "epoch": 0.73,
      "grad_norm": 21.99456795475846,
      "learning_rate": 4.0113027014349374e-06,
      "loss": 4.5624,
      "step": 800
    },
    {
      "epoch": 0.74,
      "grad_norm": 20.119033858069777,
      "learning_rate": 3.883842790773647e-06,
      "loss": 4.5602,
      "step": 805
    },
    {
      "epoch": 0.74,
      "grad_norm": 24.920824482756828,
      "learning_rate": 3.757950970527249e-06,
      "loss": 4.5386,
      "step": 810
    },
    {
      "epoch": 0.75,
      "grad_norm": 20.182025561534093,
      "learning_rate": 3.633659517458736e-06,
      "loss": 4.5425,
      "step": 815
    },
    {
      "epoch": 0.75,
      "grad_norm": 28.03996832003451,
      "learning_rate": 3.511000298021098e-06,
      "loss": 4.5275,
      "step": 820
    },
    {
      "epoch": 0.76,
      "grad_norm": 24.852500986861678,
      "learning_rate": 3.39000476018726e-06,
      "loss": 4.5516,
      "step": 825
    },
    {
      "epoch": 0.76,
      "grad_norm": 24.988158701271594,
      "learning_rate": 3.2707039253872796e-06,
      "loss": 4.5281,
      "step": 830
    },
    {
      "epoch": 0.77,
      "grad_norm": 17.31035723167651,
      "learning_rate": 3.153128380554941e-06,
      "loss": 4.5368,
      "step": 835
    },
    {
      "epoch": 0.77,
      "grad_norm": 29.648896729425456,
      "learning_rate": 3.037308270285709e-06,
      "loss": 4.4975,
      "step": 840
    },
    {
      "epoch": 0.78,
      "grad_norm": 24.46394840808864,
      "learning_rate": 2.923273289108115e-06,
      "loss": 4.4996,
      "step": 845
    },
    {
      "epoch": 0.78,
      "grad_norm": 22.621059381964272,
      "learning_rate": 2.8110526738705345e-06,
      "loss": 4.4796,
      "step": 850
    },
    {
      "epoch": 0.78,
      "grad_norm": 21.20489787284271,
      "learning_rate": 2.700675196245288e-06,
      "loss": 4.4752,
      "step": 855
    },
    {
      "epoch": 0.79,
      "grad_norm": 18.493488247322375,
      "learning_rate": 2.592169155352031e-06,
      "loss": 4.4879,
      "step": 860
    },
    {
      "epoch": 0.79,
      "grad_norm": 18.595132565936982,
      "learning_rate": 2.485562370502279e-06,
      "loss": 4.4689,
      "step": 865
    },
    {
      "epoch": 0.8,
      "grad_norm": 15.570886966815687,
      "learning_rate": 2.3808821740669608e-06,
      "loss": 4.4792,
      "step": 870
    },
    {
      "epoch": 0.8,
      "grad_norm": 19.069241841201656,
      "learning_rate": 2.2781554044688015e-06,
      "loss": 4.4739,
      "step": 875
    },
    {
      "epoch": 0.81,
      "grad_norm": 16.54384612167649,
      "learning_rate": 2.1774083993013715e-06,
      "loss": 4.4745,
      "step": 880
    },
    {
      "epoch": 0.81,
      "grad_norm": 20.19537097527119,
      "learning_rate": 2.0786669885765044e-06,
      "loss": 4.4725,
      "step": 885
    },
    {
      "epoch": 0.82,
      "grad_norm": 14.39041053114573,
      "learning_rate": 1.981956488101898e-06,
      "loss": 4.4684,
      "step": 890
    },
    {
      "epoch": 0.82,
      "grad_norm": 20.220456531856506,
      "learning_rate": 1.8873016929904942e-06,
      "loss": 4.4534,
      "step": 895
    },
    {
      "epoch": 0.83,
      "grad_norm": 27.77026468910616,
      "learning_rate": 1.7947268713034128e-06,
      "loss": 4.4627,
      "step": 900
    },
    {
      "epoch": 0.83,
      "grad_norm": 28.465744903023698,
      "learning_rate": 1.704255757827963e-06,
      "loss": 4.4307,
      "step": 905
    },
    {
      "epoch": 0.83,
      "grad_norm": 25.002218509146623,
      "learning_rate": 1.6159115479924259e-06,
      "loss": 4.4409,
      "step": 910
    },
    {
      "epoch": 0.84,
      "grad_norm": 15.71265485425737,
      "learning_rate": 1.529716891919074e-06,
      "loss": 4.4379,
      "step": 915
    },
    {
      "epoch": 0.84,
      "grad_norm": 14.402869444397375,
      "learning_rate": 1.4456938886170413e-06,
      "loss": 4.4058,
      "step": 920
    },
    {
      "epoch": 0.85,
      "grad_norm": 16.403877600978255,
      "learning_rate": 1.3638640803164516e-06,
      "loss": 4.4251,
      "step": 925
    },
    {
      "epoch": 0.85,
      "grad_norm": 13.446090650346864,
      "learning_rate": 1.2842484469453365e-06,
      "loss": 4.4229,
      "step": 930
    },
    {
      "epoch": 0.86,
      "grad_norm": 14.466447190381347,
      "learning_rate": 1.2068674007506787e-06,
      "loss": 4.4321,
      "step": 935
    },
    {
      "epoch": 0.86,
      "grad_norm": 14.052171577869647,
      "learning_rate": 1.1317407810650372e-06,
      "loss": 4.4196,
      "step": 940
    },
    {
      "epoch": 0.87,
      "grad_norm": 18.473477215309615,
      "learning_rate": 1.0588878492200261e-06,
      "loss": 4.4253,
      "step": 945
    },
    {
      "epoch": 0.87,
      "grad_norm": 19.954808882391053,
      "learning_rate": 9.883272836080116e-07,
      "loss": 4.4108,
      "step": 950
    },
    {
      "epoch": 0.88,
      "grad_norm": 14.033845271160311,
      "learning_rate": 9.200771748932513e-07,
      "loss": 4.4486,
      "step": 955
    },
    {
      "epoch": 0.88,
      "grad_norm": 21.390405932058453,
      "learning_rate": 8.541550213737171e-07,
      "loss": 4.4044,
      "step": 960
    },
    {
      "epoch": 0.89,
      "grad_norm": 11.550215082840605,
      "learning_rate": 7.905777244947954e-07,
      "loss": 4.4409,
      "step": 965
    },
    {
      "epoch": 0.89,
      "grad_norm": 13.607796114151936,
      "learning_rate": 7.293615845160196e-07,
      "loss": 4.3938,
      "step": 970
    },
    {
      "epoch": 0.89,
      "grad_norm": 9.994454732482547,
      "learning_rate": 6.705222963319191e-07,
      "loss": 4.3996,
      "step": 975
    },
    {
      "epoch": 0.9,
      "grad_norm": 11.575017099126718,
      "learning_rate": 6.140749454480932e-07,
      "loss": 4.4276,
      "step": 980
    },
    {
      "epoch": 0.9,
      "grad_norm": 10.374432351937589,
      "learning_rate": 5.600340041135133e-07,
      "loss": 4.4383,
      "step": 985
    },
    {
      "epoch": 0.91,
      "grad_norm": 12.772375916116722,
      "learning_rate": 5.0841332761005e-07,
      "loss": 4.4046,
      "step": 990
    },
    {
      "epoch": 0.91,
      "grad_norm": 14.043876323714166,
      "learning_rate": 4.592261507001994e-07,
      "loss": 4.4136,
      "step": 995
    },
    {
      "epoch": 0.92,
      "grad_norm": 10.7144239108995,
      "learning_rate": 4.124850842338779e-07,
      "loss": 4.4157,
      "step": 1000
    },
    {
      "epoch": 0.92,
      "grad_norm": 9.84410027507594,
      "learning_rate": 3.6820211191520127e-07,
      "loss": 4.3885,
      "step": 1005
    },
    {
      "epoch": 0.93,
      "grad_norm": 8.571874802816026,
      "learning_rate": 3.263885872300343e-07,
      "loss": 4.4265,
      "step": 1010
    },
    {
      "epoch": 0.93,
      "grad_norm": 10.780577259685115,
      "learning_rate": 2.870552305351382e-07,
      "loss": 4.3938,
      "step": 1015
    },
    {
      "epoch": 0.94,
      "grad_norm": 11.893503141163247,
      "learning_rate": 2.5021212630962246e-07,
      "loss": 4.3798,
      "step": 1020
    },
    {
      "epoch": 0.94,
      "grad_norm": 9.062984025133662,
      "learning_rate": 2.158687205694443e-07,
      "loss": 4.3937,
      "step": 1025
    },
    {
      "epoch": 0.94,
      "grad_norm": 8.219330068607992,
      "learning_rate": 1.840338184455881e-07,
      "loss": 4.3905,
      "step": 1030
    },
    {
      "epoch": 0.95,
      "grad_norm": 7.246539185596847,
      "learning_rate": 1.5471558192656776e-07,
      "loss": 4.3864,
      "step": 1035
    },
    {
      "epoch": 0.95,
      "grad_norm": 7.0125806843462195,
      "learning_rate": 1.279215277658097e-07,
      "loss": 4.413,
      "step": 1040
    },
    {
      "epoch": 0.96,
      "grad_norm": 9.201800725889436,
      "learning_rate": 1.0365852555447642e-07,
      "loss": 4.3679,
      "step": 1045
    },
    {
      "epoch": 0.96,
      "grad_norm": 7.076465448219673,
      "learning_rate": 8.19327959602012e-08,
      "loss": 4.3443,
      "step": 1050
    },
    {
      "epoch": 0.97,
      "grad_norm": 9.294237620575574,
      "learning_rate": 6.274990913221035e-08,
      "loss": 4.4186,
      "step": 1055
    },
    {
      "epoch": 0.97,
      "grad_norm": 5.94300131392314,
      "learning_rate": 4.6114783273213395e-08,
      "loss": 4.3711,
      "step": 1060
    },
    {
      "epoch": 0.98,
      "grad_norm": 7.1587345473456905,
      "learning_rate": 3.203168337845508e-08,
      "loss": 4.3909,
      "step": 1065
    },
    {
      "epoch": 0.98,
      "grad_norm": 8.342809351726451,
      "learning_rate": 2.05042201422323e-08,
      "loss": 4.3894,
      "step": 1070
    },
    {
      "epoch": 0.99,
      "grad_norm": 8.363435905130084,
      "learning_rate": 1.1535349032167908e-08,
      "loss": 4.3886,
      "step": 1075
    },
    {
      "epoch": 0.99,
      "grad_norm": 7.728421631832789,
      "learning_rate": 5.127369531473525e-09,
      "loss": 4.3824,
      "step": 1080
    },
    {
      "epoch": 1.0,
      "grad_norm": 6.285020285775841,
      "learning_rate": 1.2819245493955746e-09,
      "loss": 4.4253,
      "step": 1085
    },
    {
      "epoch": 1.0,
      "grad_norm": 6.94654446727408,
      "learning_rate": 0.0,
      "loss": 4.3906,
      "step": 1090
    },
    {
      "epoch": 1.0,
      "eval_loss": 4.393316268920898,
      "eval_runtime": 332.2559,
      "eval_samples_per_second": 46.443,
      "eval_steps_per_second": 0.728,
      "step": 1090
    },
    {
      "epoch": 1.0,
      "step": 1090,
      "total_flos": 456447649382400.0,
      "train_loss": 4.5692141799751775,
      "train_runtime": 12438.5839,
      "train_samples_per_second": 11.209,
      "train_steps_per_second": 0.088
    }
  ],
  "logging_steps": 5,
  "max_steps": 1090,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 456447649382400.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}