|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 4518, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 41.35739895402257, |
|
"learning_rate": 7.193423539345941e-06, |
|
"loss": 0.5141, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.443694874625423, |
|
"learning_rate": 9.358859796204429e-06, |
|
"loss": 0.4195, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.720021591145773, |
|
"learning_rate": 1.0625558804168632e-05, |
|
"loss": 0.2851, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.59250257002781, |
|
"learning_rate": 1.1524296053062918e-05, |
|
"loss": 0.2174, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 11.521550428114184, |
|
"learning_rate": 1.2221410821833392e-05, |
|
"loss": 0.1817, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.027998448017936, |
|
"learning_rate": 1.2790995061027121e-05, |
|
"loss": 0.1886, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.929816146115093, |
|
"learning_rate": 1.3272571673439616e-05, |
|
"loss": 0.1553, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.3881515971871345, |
|
"learning_rate": 1.3689732309921406e-05, |
|
"loss": 0.129, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.441118636680569, |
|
"learning_rate": 1.4057694068991321e-05, |
|
"loss": 0.1433, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.085213496930297, |
|
"learning_rate": 1.4386847078691883e-05, |
|
"loss": 0.1092, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.713361075579101, |
|
"learning_rate": 1.4684602194465794e-05, |
|
"loss": 0.1231, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.8259500358417924, |
|
"learning_rate": 1.495643131788561e-05, |
|
"loss": 0.0697, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.67920682727699, |
|
"learning_rate": 1.5206489871327869e-05, |
|
"loss": 0.084, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.189491553913532, |
|
"learning_rate": 1.54380079302981e-05, |
|
"loss": 0.1023, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.000369384166694, |
|
"learning_rate": 1.5653546086656083e-05, |
|
"loss": 0.0972, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.3726942012239953, |
|
"learning_rate": 1.5855168566779895e-05, |
|
"loss": 0.1036, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.0242117206977044, |
|
"learning_rate": 1.604456377435124e-05, |
|
"loss": 0.1081, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.2261934744219967, |
|
"learning_rate": 1.6223130325849813e-05, |
|
"loss": 0.082, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.70421073268978, |
|
"learning_rate": 1.6392039793463407e-05, |
|
"loss": 0.0744, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 12.064244277626111, |
|
"learning_rate": 1.6552283335550368e-05, |
|
"loss": 0.0934, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.150990470911233, |
|
"learning_rate": 1.67047069382623e-05, |
|
"loss": 0.0737, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.074258308547766, |
|
"learning_rate": 1.6850038451324284e-05, |
|
"loss": 0.0841, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.6764310841274237, |
|
"learning_rate": 1.6988908609137504e-05, |
|
"loss": 0.0821, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.1391283710987063, |
|
"learning_rate": 1.71218675747441e-05, |
|
"loss": 0.0747, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.1103728720325754, |
|
"learning_rate": 1.7249398104320845e-05, |
|
"loss": 0.0907, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.354173276825422, |
|
"learning_rate": 1.7371926128186358e-05, |
|
"loss": 0.0765, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.8677808267225005, |
|
"learning_rate": 1.7489829333814013e-05, |
|
"loss": 0.0777, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.6124301773304874, |
|
"learning_rate": 1.760344418715659e-05, |
|
"loss": 0.0897, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.915238130503912, |
|
"learning_rate": 1.7713071721324668e-05, |
|
"loss": 0.0866, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.061775729833908, |
|
"learning_rate": 1.781898234351457e-05, |
|
"loss": 0.0804, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.5648704857906242, |
|
"learning_rate": 1.7921419853452233e-05, |
|
"loss": 0.0834, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.225837991191409, |
|
"learning_rate": 1.8020604823638384e-05, |
|
"loss": 0.0646, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.041622096074851, |
|
"learning_rate": 1.811673745928848e-05, |
|
"loss": 0.0636, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.21100008000919218, |
|
"learning_rate": 1.821000003120973e-05, |
|
"loss": 0.0656, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.2923304434569443, |
|
"learning_rate": 1.8300558955927067e-05, |
|
"loss": 0.0756, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.6524729986322013, |
|
"learning_rate": 1.83885665827083e-05, |
|
"loss": 0.0466, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.1793660025907284, |
|
"learning_rate": 1.847416273569235e-05, |
|
"loss": 0.0704, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.363503091034068, |
|
"learning_rate": 1.8557476050321896e-05, |
|
"loss": 0.0712, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.0942807753734476, |
|
"learning_rate": 1.863862513615056e-05, |
|
"loss": 0.0831, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.789230871814937, |
|
"learning_rate": 1.8717719592408857e-05, |
|
"loss": 0.078, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.4361028432853677, |
|
"learning_rate": 1.879486089815082e-05, |
|
"loss": 0.0663, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.690439378170989, |
|
"learning_rate": 1.8870143195120794e-05, |
|
"loss": 0.0738, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.159404934400987, |
|
"learning_rate": 1.8943653978491198e-05, |
|
"loss": 0.0768, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8042070009727055, |
|
"learning_rate": 1.901547470818277e-05, |
|
"loss": 0.0777, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.70758347995601, |
|
"learning_rate": 1.9085681351478775e-05, |
|
"loss": 0.05, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2113560599237037, |
|
"learning_rate": 1.9154344865995993e-05, |
|
"loss": 0.0715, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.5816071085486527, |
|
"learning_rate": 1.9221531630710657e-05, |
|
"loss": 0.0688, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4447841193953872, |
|
"learning_rate": 1.9287303831602588e-05, |
|
"loss": 0.0659, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2668557011358101, |
|
"learning_rate": 1.9351719807533285e-05, |
|
"loss": 0.0515, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.0353893651728865, |
|
"learning_rate": 1.9414834361179333e-05, |
|
"loss": 0.0687, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.005863891715338, |
|
"learning_rate": 1.947669903917393e-05, |
|
"loss": 0.0459, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.4330303529877615, |
|
"learning_rate": 1.9537362385044847e-05, |
|
"loss": 0.0557, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.8683959370360874, |
|
"learning_rate": 1.959687016805845e-05, |
|
"loss": 0.0656, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.5963373312962135, |
|
"learning_rate": 1.9655265590672502e-05, |
|
"loss": 0.0519, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.824110595633705, |
|
"learning_rate": 1.9712589476953243e-05, |
|
"loss": 0.0557, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.024299349665376, |
|
"learning_rate": 1.976888044401508e-05, |
|
"loss": 0.0617, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.0490238284158413, |
|
"learning_rate": 1.98241750582861e-05, |
|
"loss": 0.0529, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.2602675746455845, |
|
"learning_rate": 1.9878507978183157e-05, |
|
"loss": 0.0617, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.7353304044327778, |
|
"learning_rate": 1.9931912084590654e-05, |
|
"loss": 0.0464, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.043608410331547, |
|
"learning_rate": 1.998441860037306e-05, |
|
"loss": 0.0716, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.9522735357022469, |
|
"learning_rate": 1.997786386275595e-05, |
|
"loss": 0.0496, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3740017908881452, |
|
"learning_rate": 1.9940970300682533e-05, |
|
"loss": 0.0686, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.1533525092583308, |
|
"learning_rate": 1.9904076738609114e-05, |
|
"loss": 0.0562, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.564671647601224, |
|
"learning_rate": 1.9867183176535695e-05, |
|
"loss": 0.0553, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8996693997352794, |
|
"learning_rate": 1.9830289614462276e-05, |
|
"loss": 0.0524, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.9340857910489522, |
|
"learning_rate": 1.979339605238886e-05, |
|
"loss": 0.0534, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.892682744806536, |
|
"learning_rate": 1.9756502490315442e-05, |
|
"loss": 0.0625, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.2063853192894494, |
|
"learning_rate": 1.9719608928242023e-05, |
|
"loss": 0.0519, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.984312822464147, |
|
"learning_rate": 1.9682715366168604e-05, |
|
"loss": 0.0494, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.6705446030280595, |
|
"learning_rate": 1.9645821804095185e-05, |
|
"loss": 0.059, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.7422620817223426, |
|
"learning_rate": 1.960892824202177e-05, |
|
"loss": 0.0454, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3182961229762868, |
|
"learning_rate": 1.957203467994835e-05, |
|
"loss": 0.0581, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.330360819481426, |
|
"learning_rate": 1.9535141117874932e-05, |
|
"loss": 0.0575, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.9840741868184866, |
|
"learning_rate": 1.9498247555801517e-05, |
|
"loss": 0.063, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.6018064731760029, |
|
"learning_rate": 1.9461353993728094e-05, |
|
"loss": 0.0519, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.391561342963203, |
|
"learning_rate": 1.9424460431654675e-05, |
|
"loss": 0.0572, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.0319629863193043, |
|
"learning_rate": 1.938756686958126e-05, |
|
"loss": 0.0619, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.7398111190442345, |
|
"learning_rate": 1.935067330750784e-05, |
|
"loss": 0.05, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.675180749962208, |
|
"learning_rate": 1.9313779745434422e-05, |
|
"loss": 0.0654, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.486943722740635, |
|
"learning_rate": 1.9276886183361007e-05, |
|
"loss": 0.0463, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1444156835752686, |
|
"learning_rate": 1.9239992621287588e-05, |
|
"loss": 0.0598, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1260777714975718, |
|
"learning_rate": 1.920309905921417e-05, |
|
"loss": 0.039, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.328313971146618, |
|
"learning_rate": 1.916620549714075e-05, |
|
"loss": 0.0591, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8805517771854091, |
|
"learning_rate": 1.912931193506733e-05, |
|
"loss": 0.0392, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.9704226543415952, |
|
"learning_rate": 1.9092418372993916e-05, |
|
"loss": 0.056, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.4328262810995938, |
|
"learning_rate": 1.9055524810920497e-05, |
|
"loss": 0.0602, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.6133255890647167, |
|
"learning_rate": 1.9018631248847078e-05, |
|
"loss": 0.0507, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.4574352859865667, |
|
"learning_rate": 1.898173768677366e-05, |
|
"loss": 0.0667, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.9116869451062006, |
|
"learning_rate": 1.894484412470024e-05, |
|
"loss": 0.0533, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8152499611885836, |
|
"learning_rate": 1.890795056262682e-05, |
|
"loss": 0.0534, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.9440345318883696, |
|
"learning_rate": 1.8871057000553406e-05, |
|
"loss": 0.0419, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.7588896552457074, |
|
"learning_rate": 1.8834163438479987e-05, |
|
"loss": 0.0418, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.8886319208504987, |
|
"learning_rate": 1.8797269876406568e-05, |
|
"loss": 0.0397, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.8820615671771377, |
|
"learning_rate": 1.876037631433315e-05, |
|
"loss": 0.0454, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2868214865586614, |
|
"learning_rate": 1.872348275225973e-05, |
|
"loss": 0.0373, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.016876778284122, |
|
"learning_rate": 1.8686589190186315e-05, |
|
"loss": 0.0619, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8992764743601639, |
|
"learning_rate": 1.8649695628112896e-05, |
|
"loss": 0.0403, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.42416507318131186, |
|
"learning_rate": 1.8612802066039477e-05, |
|
"loss": 0.067, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.8601423331120188, |
|
"learning_rate": 1.8575908503966062e-05, |
|
"loss": 0.066, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.3893250379275068, |
|
"learning_rate": 1.8539014941892643e-05, |
|
"loss": 0.0437, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.3240219930504376, |
|
"learning_rate": 1.850212137981922e-05, |
|
"loss": 0.0488, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8849896953543195, |
|
"learning_rate": 1.8465227817745805e-05, |
|
"loss": 0.0472, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8655212477880712, |
|
"learning_rate": 1.8428334255672386e-05, |
|
"loss": 0.0546, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.2016665548799828, |
|
"learning_rate": 1.8391440693598967e-05, |
|
"loss": 0.0956, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.492659569010543, |
|
"learning_rate": 1.8354547131525552e-05, |
|
"loss": 0.0449, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.4224837643985645, |
|
"learning_rate": 1.8317653569452133e-05, |
|
"loss": 0.0545, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.8405573730480247, |
|
"learning_rate": 1.8280760007378714e-05, |
|
"loss": 0.0328, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3988661237020132, |
|
"learning_rate": 1.8243866445305295e-05, |
|
"loss": 0.0451, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5573396543240564, |
|
"learning_rate": 1.8206972883231876e-05, |
|
"loss": 0.0502, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7855566721285819, |
|
"learning_rate": 1.817007932115846e-05, |
|
"loss": 0.0341, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.497005175064917, |
|
"learning_rate": 1.8133185759085042e-05, |
|
"loss": 0.0513, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.650975953086994, |
|
"learning_rate": 1.8096292197011623e-05, |
|
"loss": 0.0473, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8418054071334755, |
|
"learning_rate": 1.8059398634938204e-05, |
|
"loss": 0.0444, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.33651471077315, |
|
"learning_rate": 1.8022505072864785e-05, |
|
"loss": 0.0476, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.1529220423121023, |
|
"learning_rate": 1.7985611510791367e-05, |
|
"loss": 0.0379, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6912013035706749, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 0.0526, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.806793867054328, |
|
"learning_rate": 1.7911824386644532e-05, |
|
"loss": 0.047, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.323409487413899, |
|
"learning_rate": 1.7874930824571113e-05, |
|
"loss": 0.0455, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0877783561592873, |
|
"learning_rate": 1.7838037262497695e-05, |
|
"loss": 0.0555, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.9158165193439103, |
|
"learning_rate": 1.7801143700424276e-05, |
|
"loss": 0.0458, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0863638939536142, |
|
"learning_rate": 1.776425013835086e-05, |
|
"loss": 0.0462, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.3398031954044978, |
|
"learning_rate": 1.772735657627744e-05, |
|
"loss": 0.032, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.9272917966743286, |
|
"learning_rate": 1.7690463014204022e-05, |
|
"loss": 0.0464, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.4608044079217914, |
|
"learning_rate": 1.7653569452130607e-05, |
|
"loss": 0.0544, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.0799767984416713, |
|
"learning_rate": 1.7616675890057188e-05, |
|
"loss": 0.0629, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.438242481318271, |
|
"learning_rate": 1.757978232798377e-05, |
|
"loss": 0.0586, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.22666963149041416, |
|
"learning_rate": 1.754288876591035e-05, |
|
"loss": 0.0419, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.6906855856145777, |
|
"learning_rate": 1.750599520383693e-05, |
|
"loss": 0.0417, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.8417294291584847, |
|
"learning_rate": 1.7469101641763513e-05, |
|
"loss": 0.0496, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.384648750559231, |
|
"learning_rate": 1.7432208079690097e-05, |
|
"loss": 0.037, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0969098360012755, |
|
"learning_rate": 1.7395314517616678e-05, |
|
"loss": 0.0392, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.265947825753472, |
|
"learning_rate": 1.735842095554326e-05, |
|
"loss": 0.0434, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.0043877066833349, |
|
"learning_rate": 1.732152739346984e-05, |
|
"loss": 0.0423, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.3834891081373628, |
|
"learning_rate": 1.728463383139642e-05, |
|
"loss": 0.0423, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8917610552257311, |
|
"learning_rate": 1.7247740269323006e-05, |
|
"loss": 0.0491, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.7777935423085822, |
|
"learning_rate": 1.7210846707249587e-05, |
|
"loss": 0.0465, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.882686457165806, |
|
"learning_rate": 1.717395314517617e-05, |
|
"loss": 0.0366, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.4070944899475468, |
|
"learning_rate": 1.713705958310275e-05, |
|
"loss": 0.0364, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.368387839772382, |
|
"learning_rate": 1.710016602102933e-05, |
|
"loss": 0.0302, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.2202288141019488, |
|
"learning_rate": 1.7063272458955912e-05, |
|
"loss": 0.0455, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.7419694841079256, |
|
"learning_rate": 1.7026378896882496e-05, |
|
"loss": 0.0402, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5792094366611027, |
|
"learning_rate": 1.6989485334809077e-05, |
|
"loss": 0.0364, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.0092569099059323, |
|
"learning_rate": 1.695259177273566e-05, |
|
"loss": 0.0335, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.5374731387141227, |
|
"learning_rate": 1.691569821066224e-05, |
|
"loss": 0.0428, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.6268956058587998, |
|
"learning_rate": 1.687880464858882e-05, |
|
"loss": 0.0483, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.6628348966888566, |
|
"learning_rate": 1.6841911086515402e-05, |
|
"loss": 0.0487, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 8.037103442054311, |
|
"learning_rate": 1.6805017524441987e-05, |
|
"loss": 0.0679, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.620270154467724, |
|
"learning_rate": 1.6768123962368568e-05, |
|
"loss": 0.0437, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.0436875513091504, |
|
"learning_rate": 1.6731230400295152e-05, |
|
"loss": 0.0346, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8374706362800531, |
|
"learning_rate": 1.6694336838221733e-05, |
|
"loss": 0.0517, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.1939983249441695, |
|
"learning_rate": 1.6657443276148314e-05, |
|
"loss": 0.0393, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.0650702970882078, |
|
"learning_rate": 1.6620549714074896e-05, |
|
"loss": 0.0433, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.9588706576483126, |
|
"learning_rate": 1.6583656152001477e-05, |
|
"loss": 0.0381, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.4412233810414918, |
|
"learning_rate": 1.6546762589928058e-05, |
|
"loss": 0.0346, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.9533871774762268, |
|
"learning_rate": 1.6509869027854642e-05, |
|
"loss": 0.0239, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.44748765523495393, |
|
"learning_rate": 1.6472975465781223e-05, |
|
"loss": 0.0424, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.6542278196418172, |
|
"learning_rate": 1.6436081903707805e-05, |
|
"loss": 0.0322, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.774821420829752, |
|
"learning_rate": 1.6399188341634386e-05, |
|
"loss": 0.0337, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.4366984039627053, |
|
"learning_rate": 1.6362294779560967e-05, |
|
"loss": 0.0402, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.9409785268524657, |
|
"learning_rate": 1.6325401217487548e-05, |
|
"loss": 0.0351, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.3755255908395096, |
|
"learning_rate": 1.6288507655414133e-05, |
|
"loss": 0.0287, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.9105883051718862, |
|
"learning_rate": 1.6251614093340714e-05, |
|
"loss": 0.0377, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.669451044858242, |
|
"learning_rate": 1.6214720531267295e-05, |
|
"loss": 0.0397, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.0173498911812129, |
|
"learning_rate": 1.6177826969193876e-05, |
|
"loss": 0.0362, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.443726695479527, |
|
"learning_rate": 1.6140933407120457e-05, |
|
"loss": 0.0388, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.48614710000952543, |
|
"learning_rate": 1.610403984504704e-05, |
|
"loss": 0.044, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.8955875806318787, |
|
"learning_rate": 1.6067146282973623e-05, |
|
"loss": 0.0361, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.7136011214906065, |
|
"learning_rate": 1.6030252720900204e-05, |
|
"loss": 0.0399, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.342383821315822, |
|
"learning_rate": 1.599335915882679e-05, |
|
"loss": 0.0398, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.9154366056765233, |
|
"learning_rate": 1.5956465596753366e-05, |
|
"loss": 0.0351, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 3.380571358597728, |
|
"learning_rate": 1.5919572034679947e-05, |
|
"loss": 0.0355, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.4160947580259224, |
|
"learning_rate": 1.5882678472606532e-05, |
|
"loss": 0.0333, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.6737638581076882, |
|
"learning_rate": 1.5845784910533113e-05, |
|
"loss": 0.0296, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.206337590570484, |
|
"learning_rate": 1.5808891348459694e-05, |
|
"loss": 0.0308, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.7350933133142535, |
|
"learning_rate": 1.577199778638628e-05, |
|
"loss": 0.0335, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 1.3603619905771789, |
|
"learning_rate": 1.573510422431286e-05, |
|
"loss": 0.0254, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.6337146921122443, |
|
"learning_rate": 1.569821066223944e-05, |
|
"loss": 0.0381, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.3833442570741445, |
|
"learning_rate": 1.5661317100166022e-05, |
|
"loss": 0.0295, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.1660396503165016, |
|
"learning_rate": 1.5624423538092603e-05, |
|
"loss": 0.0313, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.085393536126589, |
|
"learning_rate": 1.5587529976019188e-05, |
|
"loss": 0.0294, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.9428955580680065, |
|
"learning_rate": 1.555063641394577e-05, |
|
"loss": 0.0403, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.2824389825772662, |
|
"learning_rate": 1.551374285187235e-05, |
|
"loss": 0.0326, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.071530839035684, |
|
"learning_rate": 1.547684928979893e-05, |
|
"loss": 0.0224, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.7250437295813942, |
|
"learning_rate": 1.5439955727725512e-05, |
|
"loss": 0.0253, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.4676245052117833, |
|
"learning_rate": 1.5403062165652093e-05, |
|
"loss": 0.0245, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.3654237978697314, |
|
"learning_rate": 1.5366168603578678e-05, |
|
"loss": 0.0274, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.8371654653693097, |
|
"learning_rate": 1.532927504150526e-05, |
|
"loss": 0.0408, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.918371136718616, |
|
"learning_rate": 1.529238147943184e-05, |
|
"loss": 0.0252, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0100840319244568, |
|
"learning_rate": 1.5255487917358423e-05, |
|
"loss": 0.0282, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.2752226504227586, |
|
"learning_rate": 1.5218594355285004e-05, |
|
"loss": 0.0481, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.016628356639124, |
|
"learning_rate": 1.5181700793211587e-05, |
|
"loss": 0.0534, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.2018849311057271, |
|
"learning_rate": 1.5144807231138168e-05, |
|
"loss": 0.0357, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.3543149671289594, |
|
"learning_rate": 1.5107913669064749e-05, |
|
"loss": 0.0354, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.0460241475499934, |
|
"learning_rate": 1.5071020106991332e-05, |
|
"loss": 0.0328, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.7712993931217346, |
|
"learning_rate": 1.5034126544917913e-05, |
|
"loss": 0.0352, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.6403552627247137, |
|
"learning_rate": 1.4997232982844494e-05, |
|
"loss": 0.0374, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.6985534187744373, |
|
"learning_rate": 1.4960339420771077e-05, |
|
"loss": 0.0236, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.7973859948867315, |
|
"learning_rate": 1.4923445858697658e-05, |
|
"loss": 0.0333, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.7856751051336177, |
|
"learning_rate": 1.488655229662424e-05, |
|
"loss": 0.0283, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 1.1090831103953211, |
|
"learning_rate": 1.4849658734550822e-05, |
|
"loss": 0.0424, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.37705371680669364, |
|
"learning_rate": 1.4812765172477403e-05, |
|
"loss": 0.0264, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.011193423521568, |
|
"learning_rate": 1.4775871610403986e-05, |
|
"loss": 0.0325, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.00662492673962, |
|
"learning_rate": 1.4738978048330567e-05, |
|
"loss": 0.0325, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.0249189181939053, |
|
"learning_rate": 1.4702084486257148e-05, |
|
"loss": 0.0296, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.294162625472094, |
|
"learning_rate": 1.4665190924183733e-05, |
|
"loss": 0.0362, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.8753798886295676, |
|
"learning_rate": 1.4628297362110312e-05, |
|
"loss": 0.0352, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.32404034083576144, |
|
"learning_rate": 1.4591403800036893e-05, |
|
"loss": 0.0319, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.3881694244187606, |
|
"learning_rate": 1.4554510237963478e-05, |
|
"loss": 0.0383, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.9106958124280001, |
|
"learning_rate": 1.4517616675890059e-05, |
|
"loss": 0.0366, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.3920757998129725, |
|
"learning_rate": 1.448072311381664e-05, |
|
"loss": 0.0359, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.5378193320802556, |
|
"learning_rate": 1.4443829551743223e-05, |
|
"loss": 0.0375, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.6327275888367767, |
|
"learning_rate": 1.4406935989669804e-05, |
|
"loss": 0.0491, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.457863332305797, |
|
"learning_rate": 1.4370042427596385e-05, |
|
"loss": 0.0379, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.19454376571103232, |
|
"learning_rate": 1.4333148865522968e-05, |
|
"loss": 0.0357, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.7421971370673671, |
|
"learning_rate": 1.429625530344955e-05, |
|
"loss": 0.0418, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.0937417741981473, |
|
"learning_rate": 1.4259361741376132e-05, |
|
"loss": 0.0248, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.7544900086855553, |
|
"learning_rate": 1.4222468179302713e-05, |
|
"loss": 0.0242, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.10357082442667653, |
|
"learning_rate": 1.4185574617229294e-05, |
|
"loss": 0.0277, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.429204704240934, |
|
"learning_rate": 1.4148681055155877e-05, |
|
"loss": 0.0253, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.7367754983146269, |
|
"learning_rate": 1.4111787493082458e-05, |
|
"loss": 0.0337, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.6428519364176438, |
|
"learning_rate": 1.407489393100904e-05, |
|
"loss": 0.0331, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.7848514705508334, |
|
"learning_rate": 1.4038000368935622e-05, |
|
"loss": 0.031, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.2604412780544694, |
|
"learning_rate": 1.4001106806862203e-05, |
|
"loss": 0.0381, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.0154534415085918, |
|
"learning_rate": 1.3964213244788784e-05, |
|
"loss": 0.0294, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.2166332545112624, |
|
"learning_rate": 1.3927319682715367e-05, |
|
"loss": 0.0275, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.8758369938501386, |
|
"learning_rate": 1.3890426120641948e-05, |
|
"loss": 0.0359, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.9645035178583596, |
|
"learning_rate": 1.385353255856853e-05, |
|
"loss": 0.0257, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.24217577106008117, |
|
"learning_rate": 1.3816638996495112e-05, |
|
"loss": 0.0188, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.8656287403232453, |
|
"learning_rate": 1.3779745434421693e-05, |
|
"loss": 0.035, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 1.5535880950760363, |
|
"learning_rate": 1.3742851872348278e-05, |
|
"loss": 0.0325, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.6356266461691455, |
|
"learning_rate": 1.3705958310274859e-05, |
|
"loss": 0.0292, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.3067413428110854, |
|
"learning_rate": 1.3669064748201439e-05, |
|
"loss": 0.0206, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.3327319855425117, |
|
"learning_rate": 1.3632171186128023e-05, |
|
"loss": 0.023, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.308037550483557, |
|
"learning_rate": 1.3595277624054604e-05, |
|
"loss": 0.0283, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.447528110026965, |
|
"learning_rate": 1.3558384061981185e-05, |
|
"loss": 0.0344, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.4781377553893515, |
|
"learning_rate": 1.3521490499907768e-05, |
|
"loss": 0.0338, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.2213361639825704, |
|
"learning_rate": 1.348459693783435e-05, |
|
"loss": 0.0213, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.806579624483452, |
|
"learning_rate": 1.344770337576093e-05, |
|
"loss": 0.0308, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.36305855519021074, |
|
"learning_rate": 1.3410809813687513e-05, |
|
"loss": 0.0307, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.7963102033556697, |
|
"learning_rate": 1.3373916251614094e-05, |
|
"loss": 0.0438, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.8741626138535608, |
|
"learning_rate": 1.3337022689540676e-05, |
|
"loss": 0.0282, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.8183941890622122, |
|
"learning_rate": 1.3300129127467258e-05, |
|
"loss": 0.03, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 1.315171338874376, |
|
"learning_rate": 1.326323556539384e-05, |
|
"loss": 0.0347, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.268446896432008, |
|
"learning_rate": 1.3226342003320422e-05, |
|
"loss": 0.0376, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.913219454382149, |
|
"learning_rate": 1.3189448441247003e-05, |
|
"loss": 0.0358, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.1043754536185766, |
|
"learning_rate": 1.3152554879173585e-05, |
|
"loss": 0.0284, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.203565789266508, |
|
"learning_rate": 1.3115661317100167e-05, |
|
"loss": 0.0292, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.9568074416295906, |
|
"learning_rate": 1.3078767755026749e-05, |
|
"loss": 0.0262, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2859744392048369, |
|
"learning_rate": 1.304187419295333e-05, |
|
"loss": 0.0295, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.077880610534017, |
|
"learning_rate": 1.3004980630879912e-05, |
|
"loss": 0.0288, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.9333475338900054, |
|
"learning_rate": 1.2968087068806494e-05, |
|
"loss": 0.0355, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.904061045593678, |
|
"learning_rate": 1.2931193506733075e-05, |
|
"loss": 0.0251, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.58158119774247, |
|
"learning_rate": 1.2894299944659658e-05, |
|
"loss": 0.0233, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.7277481905024581, |
|
"learning_rate": 1.2857406382586239e-05, |
|
"loss": 0.0298, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.8262592151991606, |
|
"learning_rate": 1.2820512820512823e-05, |
|
"loss": 0.0276, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.6465268769367779, |
|
"learning_rate": 1.2783619258439404e-05, |
|
"loss": 0.027, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.6081326854882737, |
|
"learning_rate": 1.2746725696365985e-05, |
|
"loss": 0.0337, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.7068771700231296, |
|
"learning_rate": 1.2709832134292568e-05, |
|
"loss": 0.0236, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.8049539476685674, |
|
"learning_rate": 1.267293857221915e-05, |
|
"loss": 0.0282, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.553245442957324, |
|
"learning_rate": 1.263604501014573e-05, |
|
"loss": 0.0226, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.8295872212818136, |
|
"learning_rate": 1.2599151448072313e-05, |
|
"loss": 0.0232, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.9798348798123743, |
|
"learning_rate": 1.2562257885998895e-05, |
|
"loss": 0.0456, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.19245304335107, |
|
"learning_rate": 1.2525364323925476e-05, |
|
"loss": 0.0284, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.5998767447680797, |
|
"learning_rate": 1.2488470761852058e-05, |
|
"loss": 0.0307, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.7431018978542536, |
|
"learning_rate": 1.245157719977864e-05, |
|
"loss": 0.0216, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.0864224894613053, |
|
"learning_rate": 1.241468363770522e-05, |
|
"loss": 0.0388, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.164074288346446, |
|
"learning_rate": 1.2377790075631804e-05, |
|
"loss": 0.0302, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.6890867086297026, |
|
"learning_rate": 1.2340896513558385e-05, |
|
"loss": 0.0258, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.1150440915546074, |
|
"learning_rate": 1.2304002951484968e-05, |
|
"loss": 0.0279, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.7515709240098276, |
|
"learning_rate": 1.2267109389411549e-05, |
|
"loss": 0.0419, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.7795946895425347, |
|
"learning_rate": 1.223021582733813e-05, |
|
"loss": 0.0152, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.7206757082802774, |
|
"learning_rate": 1.2193322265264713e-05, |
|
"loss": 0.039, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.9261417250470075, |
|
"learning_rate": 1.2156428703191294e-05, |
|
"loss": 0.031, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.9738836548814439, |
|
"learning_rate": 1.2119535141117875e-05, |
|
"loss": 0.0318, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 1.6517158198941853, |
|
"learning_rate": 1.2082641579044458e-05, |
|
"loss": 0.0346, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.8931816948527265, |
|
"learning_rate": 1.2045748016971039e-05, |
|
"loss": 0.0278, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.5388621088460506, |
|
"learning_rate": 1.200885445489762e-05, |
|
"loss": 0.0296, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.6080136375748658, |
|
"learning_rate": 1.1971960892824204e-05, |
|
"loss": 0.0314, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.7855564824393652, |
|
"learning_rate": 1.1935067330750784e-05, |
|
"loss": 0.0371, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 1.391898825501054, |
|
"learning_rate": 1.1898173768677365e-05, |
|
"loss": 0.0215, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.023622261403316365, |
|
"learning_rate": 1.186128020660395e-05, |
|
"loss": 0.0289, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.2844725300311655, |
|
"learning_rate": 1.182438664453053e-05, |
|
"loss": 0.0272, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.008098058731733223, |
|
"learning_rate": 1.1787493082457114e-05, |
|
"loss": 0.021, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.4455783687272565, |
|
"learning_rate": 1.1750599520383695e-05, |
|
"loss": 0.0387, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.5832035368747368, |
|
"learning_rate": 1.1713705958310276e-05, |
|
"loss": 0.0392, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.4288935348488189, |
|
"learning_rate": 1.1676812396236859e-05, |
|
"loss": 0.0243, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.85140142754813, |
|
"learning_rate": 1.163991883416344e-05, |
|
"loss": 0.0288, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.6208450731238502, |
|
"learning_rate": 1.1603025272090021e-05, |
|
"loss": 0.0257, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.5506545117162834, |
|
"learning_rate": 1.1566131710016604e-05, |
|
"loss": 0.0257, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.6983830876671564, |
|
"learning_rate": 1.1529238147943185e-05, |
|
"loss": 0.0153, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.5365569446878222, |
|
"learning_rate": 1.1492344585869766e-05, |
|
"loss": 0.0287, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.48404097576321364, |
|
"learning_rate": 1.1455451023796349e-05, |
|
"loss": 0.0189, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.2656124897423078, |
|
"learning_rate": 1.141855746172293e-05, |
|
"loss": 0.0321, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.9816015622085352, |
|
"learning_rate": 1.1381663899649511e-05, |
|
"loss": 0.0392, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.6810737744401563, |
|
"learning_rate": 1.1344770337576094e-05, |
|
"loss": 0.022, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 1.4543583055955254, |
|
"learning_rate": 1.1307876775502675e-05, |
|
"loss": 0.0331, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 1.4710753680279536, |
|
"learning_rate": 1.1270983213429258e-05, |
|
"loss": 0.0383, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.2478883072438642, |
|
"learning_rate": 1.1234089651355839e-05, |
|
"loss": 0.0316, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.2702061081470153, |
|
"learning_rate": 1.119719608928242e-05, |
|
"loss": 0.0282, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.6940800534656072, |
|
"learning_rate": 1.1160302527209005e-05, |
|
"loss": 0.0243, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.576849535693127, |
|
"learning_rate": 1.1123408965135584e-05, |
|
"loss": 0.0209, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.7227915044457328, |
|
"learning_rate": 1.1086515403062165e-05, |
|
"loss": 0.0243, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 9.838138591586603, |
|
"learning_rate": 1.104962184098875e-05, |
|
"loss": 0.0287, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.8776202139061747, |
|
"learning_rate": 1.101272827891533e-05, |
|
"loss": 0.0243, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.3717854528241673, |
|
"learning_rate": 1.097583471684191e-05, |
|
"loss": 0.0229, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.5052530178666412, |
|
"learning_rate": 1.0938941154768495e-05, |
|
"loss": 0.0232, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.7935110408300886, |
|
"learning_rate": 1.0902047592695076e-05, |
|
"loss": 0.0223, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.5764455237822945, |
|
"learning_rate": 1.0865154030621657e-05, |
|
"loss": 0.017, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.844609705609384, |
|
"learning_rate": 1.082826046854824e-05, |
|
"loss": 0.0247, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 1.3041682780604213, |
|
"learning_rate": 1.0791366906474821e-05, |
|
"loss": 0.0216, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.4847618373381675, |
|
"learning_rate": 1.0754473344401404e-05, |
|
"loss": 0.0229, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.575833133756361, |
|
"learning_rate": 1.0717579782327985e-05, |
|
"loss": 0.0225, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.064228503124154, |
|
"learning_rate": 1.0680686220254566e-05, |
|
"loss": 0.0212, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.6517908414314305, |
|
"learning_rate": 1.0643792658181149e-05, |
|
"loss": 0.0222, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 1.6514888202035902, |
|
"learning_rate": 1.060689909610773e-05, |
|
"loss": 0.0326, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.5296943703361738, |
|
"learning_rate": 1.0570005534034311e-05, |
|
"loss": 0.0292, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.020711807231569, |
|
"learning_rate": 1.0533111971960894e-05, |
|
"loss": 0.0226, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.5315183634882714, |
|
"learning_rate": 1.0496218409887475e-05, |
|
"loss": 0.0198, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.7542133397646632, |
|
"learning_rate": 1.0459324847814056e-05, |
|
"loss": 0.0201, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.4709649935718534, |
|
"learning_rate": 1.0422431285740639e-05, |
|
"loss": 0.0221, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 1.6579768697093855, |
|
"learning_rate": 1.038553772366722e-05, |
|
"loss": 0.0263, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.6342891371924907, |
|
"learning_rate": 1.0348644161593803e-05, |
|
"loss": 0.0229, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.37353433845779555, |
|
"learning_rate": 1.0311750599520384e-05, |
|
"loss": 0.0201, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.4459249487588341, |
|
"learning_rate": 1.0274857037446965e-05, |
|
"loss": 0.0283, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.6151962921561034, |
|
"learning_rate": 1.023796347537355e-05, |
|
"loss": 0.0118, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.4262238116120927, |
|
"learning_rate": 1.020106991330013e-05, |
|
"loss": 0.0229, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.228045108128222, |
|
"learning_rate": 1.016417635122671e-05, |
|
"loss": 0.0217, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.6530702622761245, |
|
"learning_rate": 1.0127282789153295e-05, |
|
"loss": 0.0207, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.9041502597805957, |
|
"learning_rate": 1.0090389227079876e-05, |
|
"loss": 0.0231, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.6535669839017003, |
|
"learning_rate": 1.0053495665006457e-05, |
|
"loss": 0.0225, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.3311305874562087, |
|
"learning_rate": 1.001660210293304e-05, |
|
"loss": 0.0203, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.8823147611211436, |
|
"learning_rate": 9.979708540859621e-06, |
|
"loss": 0.0276, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.5562568296719255, |
|
"learning_rate": 9.942814978786202e-06, |
|
"loss": 0.0269, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.6094359831984283, |
|
"learning_rate": 9.905921416712785e-06, |
|
"loss": 0.0205, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.2948821888190865, |
|
"learning_rate": 9.869027854639366e-06, |
|
"loss": 0.02, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.2523552251279377, |
|
"learning_rate": 9.832134292565947e-06, |
|
"loss": 0.0202, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.4590207341781818, |
|
"learning_rate": 9.79524073049253e-06, |
|
"loss": 0.02, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.385641051376586, |
|
"learning_rate": 9.758347168419111e-06, |
|
"loss": 0.0326, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.304443346442453, |
|
"learning_rate": 9.721453606345692e-06, |
|
"loss": 0.0469, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.7921276423545808, |
|
"learning_rate": 9.684560044272275e-06, |
|
"loss": 0.0226, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.7313140221633634, |
|
"learning_rate": 9.647666482198858e-06, |
|
"loss": 0.0208, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.8616674537642799, |
|
"learning_rate": 9.61077292012544e-06, |
|
"loss": 0.0228, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.3223458276996969, |
|
"learning_rate": 9.57387935805202e-06, |
|
"loss": 0.0204, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.25847436736502, |
|
"learning_rate": 9.536985795978603e-06, |
|
"loss": 0.0184, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.0102739659378692, |
|
"learning_rate": 9.500092233905184e-06, |
|
"loss": 0.021, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.4614770769943895, |
|
"learning_rate": 9.463198671831765e-06, |
|
"loss": 0.0192, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.7181037817361005, |
|
"learning_rate": 9.426305109758348e-06, |
|
"loss": 0.0219, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.8881807080809706, |
|
"learning_rate": 9.38941154768493e-06, |
|
"loss": 0.0149, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.8001361388360979, |
|
"learning_rate": 9.35251798561151e-06, |
|
"loss": 0.0148, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.4596057880898412, |
|
"learning_rate": 9.315624423538093e-06, |
|
"loss": 0.0171, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.3649340327954503, |
|
"learning_rate": 9.278730861464676e-06, |
|
"loss": 0.0217, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.3661755964392428, |
|
"learning_rate": 9.241837299391257e-06, |
|
"loss": 0.0262, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.1221188633493766, |
|
"learning_rate": 9.204943737317838e-06, |
|
"loss": 0.0254, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.12414790838540636, |
|
"learning_rate": 9.168050175244421e-06, |
|
"loss": 0.0193, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.8194229025263218, |
|
"learning_rate": 9.131156613171002e-06, |
|
"loss": 0.0132, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.7710864622318587, |
|
"learning_rate": 9.094263051097584e-06, |
|
"loss": 0.0229, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.2621629718343774, |
|
"learning_rate": 9.057369489024166e-06, |
|
"loss": 0.0214, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.31130549301919014, |
|
"learning_rate": 9.020475926950747e-06, |
|
"loss": 0.024, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 1.086050014018613, |
|
"learning_rate": 8.98358236487733e-06, |
|
"loss": 0.0251, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.7029996124364184, |
|
"learning_rate": 8.946688802803911e-06, |
|
"loss": 0.0215, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.3162716307135196, |
|
"learning_rate": 8.909795240730493e-06, |
|
"loss": 0.0198, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.15643570116786681, |
|
"learning_rate": 8.872901678657075e-06, |
|
"loss": 0.019, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.45575568712361675, |
|
"learning_rate": 8.836008116583657e-06, |
|
"loss": 0.0236, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.917784424801474, |
|
"learning_rate": 8.79911455451024e-06, |
|
"loss": 0.0262, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.3932007070706611, |
|
"learning_rate": 8.76222099243682e-06, |
|
"loss": 0.0203, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.0304061442597747, |
|
"learning_rate": 8.725327430363403e-06, |
|
"loss": 0.0265, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.161967395476058, |
|
"learning_rate": 8.688433868289984e-06, |
|
"loss": 0.024, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.43755927706096664, |
|
"learning_rate": 8.651540306216566e-06, |
|
"loss": 0.0249, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.45983444385917566, |
|
"learning_rate": 8.614646744143148e-06, |
|
"loss": 0.0121, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.0607636793892183, |
|
"learning_rate": 8.57775318206973e-06, |
|
"loss": 0.023, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 5.484759977473768, |
|
"learning_rate": 8.54085961999631e-06, |
|
"loss": 0.0208, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.6873972122341183, |
|
"learning_rate": 8.503966057922893e-06, |
|
"loss": 0.0224, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.9003081708047279, |
|
"learning_rate": 8.467072495849475e-06, |
|
"loss": 0.018, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.8521011818568595, |
|
"learning_rate": 8.430178933776056e-06, |
|
"loss": 0.0306, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.6956977848317097, |
|
"learning_rate": 8.393285371702639e-06, |
|
"loss": 0.023, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.5043637468484221, |
|
"learning_rate": 8.356391809629221e-06, |
|
"loss": 0.0193, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.9799158125352796, |
|
"learning_rate": 8.319498247555803e-06, |
|
"loss": 0.0211, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.33259982625437207, |
|
"learning_rate": 8.282604685482384e-06, |
|
"loss": 0.0217, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.0900206610270098, |
|
"learning_rate": 8.245711123408966e-06, |
|
"loss": 0.0291, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.8493131274550083, |
|
"learning_rate": 8.208817561335548e-06, |
|
"loss": 0.0186, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.0120847201082528, |
|
"learning_rate": 8.171923999262129e-06, |
|
"loss": 0.0241, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 1.1886551408595796, |
|
"learning_rate": 8.135030437188712e-06, |
|
"loss": 0.0263, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.7674536373536617, |
|
"learning_rate": 8.098136875115293e-06, |
|
"loss": 0.0162, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.9599986258139311, |
|
"learning_rate": 8.061243313041874e-06, |
|
"loss": 0.0311, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.7855164569900942, |
|
"learning_rate": 8.024349750968457e-06, |
|
"loss": 0.02, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.7680612067976029, |
|
"learning_rate": 7.987456188895038e-06, |
|
"loss": 0.0196, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.49399032116203606, |
|
"learning_rate": 7.95056262682162e-06, |
|
"loss": 0.0184, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.8318343668356561, |
|
"learning_rate": 7.913669064748202e-06, |
|
"loss": 0.0146, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.5414520871794223, |
|
"learning_rate": 7.876775502674785e-06, |
|
"loss": 0.0201, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.9871603598974189, |
|
"learning_rate": 7.839881940601366e-06, |
|
"loss": 0.0182, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.9254049832223281, |
|
"learning_rate": 7.802988378527947e-06, |
|
"loss": 0.019, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.04138757868391416, |
|
"learning_rate": 7.76609481645453e-06, |
|
"loss": 0.0127, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 1.0624519786080426, |
|
"learning_rate": 7.72920125438111e-06, |
|
"loss": 0.0137, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.545143622725685, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.0264, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.7165870463329832, |
|
"learning_rate": 7.655414130234275e-06, |
|
"loss": 0.0169, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.6659772755609369, |
|
"learning_rate": 7.618520568160857e-06, |
|
"loss": 0.0292, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.4324913671742698, |
|
"learning_rate": 7.581627006087439e-06, |
|
"loss": 0.0186, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.4677680379599719, |
|
"learning_rate": 7.54473344401402e-06, |
|
"loss": 0.0207, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.5611844573800361, |
|
"learning_rate": 7.507839881940602e-06, |
|
"loss": 0.0214, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.7203601548324894, |
|
"learning_rate": 7.470946319867184e-06, |
|
"loss": 0.0208, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.7513235652150956, |
|
"learning_rate": 7.434052757793766e-06, |
|
"loss": 0.0297, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.6663228554103, |
|
"learning_rate": 7.397159195720347e-06, |
|
"loss": 0.0203, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.025646882506606, |
|
"learning_rate": 7.360265633646929e-06, |
|
"loss": 0.0165, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.104810538987063, |
|
"learning_rate": 7.323372071573512e-06, |
|
"loss": 0.025, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.1222939795401854, |
|
"learning_rate": 7.286478509500092e-06, |
|
"loss": 0.0184, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.40339534329472254, |
|
"learning_rate": 7.249584947426675e-06, |
|
"loss": 0.0172, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.3827747835522215, |
|
"learning_rate": 7.212691385353257e-06, |
|
"loss": 0.0176, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.5976609123895832, |
|
"learning_rate": 7.175797823279839e-06, |
|
"loss": 0.0288, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.248750777678682, |
|
"learning_rate": 7.13890426120642e-06, |
|
"loss": 0.018, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.7797224751733178, |
|
"learning_rate": 7.102010699133002e-06, |
|
"loss": 0.0202, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.7965109498444111, |
|
"learning_rate": 7.065117137059584e-06, |
|
"loss": 0.0193, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7504440026508671, |
|
"learning_rate": 7.028223574986166e-06, |
|
"loss": 0.0208, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.3650634488062586, |
|
"learning_rate": 6.991330012912747e-06, |
|
"loss": 0.0156, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.4739908339986351, |
|
"learning_rate": 6.954436450839329e-06, |
|
"loss": 0.0253, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.40917205803434015, |
|
"learning_rate": 6.917542888765911e-06, |
|
"loss": 0.0223, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.0343715356497936, |
|
"learning_rate": 6.880649326692492e-06, |
|
"loss": 0.0239, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.2172584699084175, |
|
"learning_rate": 6.843755764619075e-06, |
|
"loss": 0.0198, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.1649553267102644, |
|
"learning_rate": 6.806862202545657e-06, |
|
"loss": 0.0155, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.750390467725635, |
|
"learning_rate": 6.769968640472239e-06, |
|
"loss": 0.0142, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.37624795209764544, |
|
"learning_rate": 6.73307507839882e-06, |
|
"loss": 0.017, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.7620471735392086, |
|
"learning_rate": 6.696181516325402e-06, |
|
"loss": 0.0196, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.699688822688377, |
|
"learning_rate": 6.659287954251984e-06, |
|
"loss": 0.0211, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.6564323081238257, |
|
"learning_rate": 6.622394392178565e-06, |
|
"loss": 0.0175, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 4.47664345340197, |
|
"learning_rate": 6.585500830105147e-06, |
|
"loss": 0.0268, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.5855473045668194, |
|
"learning_rate": 6.548607268031729e-06, |
|
"loss": 0.0197, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.3900950078903462, |
|
"learning_rate": 6.511713705958311e-06, |
|
"loss": 0.0139, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.6231685029419647, |
|
"learning_rate": 6.474820143884892e-06, |
|
"loss": 0.0207, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.4647351847789896, |
|
"learning_rate": 6.437926581811474e-06, |
|
"loss": 0.0251, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.8968411402674165, |
|
"learning_rate": 6.401033019738057e-06, |
|
"loss": 0.02, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.6599408896389465, |
|
"learning_rate": 6.364139457664637e-06, |
|
"loss": 0.0204, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.5235302401701253, |
|
"learning_rate": 6.32724589559122e-06, |
|
"loss": 0.021, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.0083480713032584, |
|
"learning_rate": 6.290352333517802e-06, |
|
"loss": 0.0176, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.0120381326163095, |
|
"learning_rate": 6.253458771444384e-06, |
|
"loss": 0.0234, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.3180666902395828, |
|
"learning_rate": 6.216565209370965e-06, |
|
"loss": 0.0227, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.948900773905558, |
|
"learning_rate": 6.179671647297547e-06, |
|
"loss": 0.0191, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.6367559831650437, |
|
"learning_rate": 6.142778085224129e-06, |
|
"loss": 0.0189, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.6717504563338927, |
|
"learning_rate": 6.10588452315071e-06, |
|
"loss": 0.0193, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.4138448431041914, |
|
"learning_rate": 6.068990961077292e-06, |
|
"loss": 0.02, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.6941281207519591, |
|
"learning_rate": 6.032097399003874e-06, |
|
"loss": 0.0163, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.054337846229134, |
|
"learning_rate": 5.995203836930457e-06, |
|
"loss": 0.0163, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.5690430661632323, |
|
"learning_rate": 5.958310274857037e-06, |
|
"loss": 0.0183, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.0019959557731468, |
|
"learning_rate": 5.92141671278362e-06, |
|
"loss": 0.0198, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.3627897902276356, |
|
"learning_rate": 5.884523150710202e-06, |
|
"loss": 0.0182, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.3794782980567666, |
|
"learning_rate": 5.847629588636783e-06, |
|
"loss": 0.0182, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.3295687746448634, |
|
"learning_rate": 5.810736026563365e-06, |
|
"loss": 0.0195, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.6192695411260121, |
|
"learning_rate": 5.773842464489947e-06, |
|
"loss": 0.0322, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.49045425590494857, |
|
"learning_rate": 5.736948902416529e-06, |
|
"loss": 0.0131, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.5148148095849162, |
|
"learning_rate": 5.70005534034311e-06, |
|
"loss": 0.0082, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.1798023900555077, |
|
"learning_rate": 5.663161778269692e-06, |
|
"loss": 0.0146, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.0171878876748512, |
|
"learning_rate": 5.626268216196274e-06, |
|
"loss": 0.0177, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.48523277809422005, |
|
"learning_rate": 5.589374654122855e-06, |
|
"loss": 0.0179, |
|
"step": 4510 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6024, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|