{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999207397622193,
  "eval_steps": 500,
  "global_step": 2838,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 54.60578280011217, "learning_rate": 3.4482758620689656e-07, "loss": 2.0852, "step": 1 },
    { "epoch": 0.01, "grad_norm": 42.54745106429926, "learning_rate": 1.724137931034483e-06, "loss": 2.0381, "step": 5 },
    { "epoch": 0.01, "grad_norm": 9.212853479586382, "learning_rate": 3.448275862068966e-06, "loss": 1.6632, "step": 10 },
    { "epoch": 0.02, "grad_norm": 5.314912275996213, "learning_rate": 5.172413793103449e-06, "loss": 1.3468, "step": 15 },
    { "epoch": 0.02, "grad_norm": 4.03433890035307, "learning_rate": 6.896551724137932e-06, "loss": 1.2538, "step": 20 },
    { "epoch": 0.03, "grad_norm": 2.6447411660183016, "learning_rate": 8.620689655172414e-06, "loss": 1.2604, "step": 25 },
    { "epoch": 0.03, "grad_norm": 2.1857065034640795, "learning_rate": 9.999996872939885e-06, "loss": 1.1935, "step": 30 },
    { "epoch": 0.04, "grad_norm": 1.7526909326796276, "learning_rate": 9.999887426246524e-06, "loss": 1.1939, "step": 35 },
    { "epoch": 0.04, "grad_norm": 1.5378704654179458, "learning_rate": 9.999621630458743e-06, "loss": 1.1626, "step": 40 },
    { "epoch": 0.05, "grad_norm": 1.3071739468687105, "learning_rate": 9.999199493888118e-06, "loss": 1.122, "step": 45 },
    { "epoch": 0.05, "grad_norm": 1.1982413397341172, "learning_rate": 9.998621029735082e-06, "loss": 1.1191, "step": 50 },
    { "epoch": 0.06, "grad_norm": 1.0956550608235818, "learning_rate": 9.997886256088507e-06, "loss": 1.123, "step": 55 },
    { "epoch": 0.06, "grad_norm": 1.0821639142212824, "learning_rate": 9.996995195925152e-06, "loss": 1.0751, "step": 60 },
    { "epoch": 0.07, "grad_norm": 1.0865870956103822, "learning_rate": 9.995947877108933e-06, "loss": 1.114, "step": 65 },
    { "epoch": 0.07, "grad_norm": 1.0582468505041362, "learning_rate": 9.99474433239006e-06, "loss": 1.0609, "step": 70 },
    { "epoch": 0.08, "grad_norm": 1.0556893008132, "learning_rate": 9.993384599404001e-06, "loss": 1.0861, "step": 75 },
    { "epoch": 0.08, "grad_norm": 1.053104954690895, "learning_rate": 9.991868720670322e-06, "loss": 1.0792, "step": 80 },
    { "epoch": 0.09, "grad_norm": 1.069931327549148, "learning_rate": 9.990196743591341e-06, "loss": 1.0722, "step": 85 },
    { "epoch": 0.1, "grad_norm": 1.0793525694046096, "learning_rate": 9.988368720450656e-06, "loss": 1.0561, "step": 90 },
    { "epoch": 0.1, "grad_norm": 1.0449093375062701, "learning_rate": 9.986384708411507e-06, "loss": 1.0675, "step": 95 },
    { "epoch": 0.11, "grad_norm": 0.9947694351867048, "learning_rate": 9.984244769514988e-06, "loss": 1.0395, "step": 100 },
    { "epoch": 0.11, "grad_norm": 1.0281907914775885, "learning_rate": 9.981948970678107e-06, "loss": 1.0455, "step": 105 },
    { "epoch": 0.12, "grad_norm": 1.0429607059919377, "learning_rate": 9.979497383691695e-06, "loss": 1.014, "step": 110 },
    { "epoch": 0.12, "grad_norm": 1.1551790238118493, "learning_rate": 9.976890085218157e-06, "loss": 1.0335, "step": 115 },
    { "epoch": 0.13, "grad_norm": 1.0958017153997939, "learning_rate": 9.974127156789082e-06, "loss": 1.0566, "step": 120 },
    { "epoch": 0.13, "grad_norm": 1.0545164230640043, "learning_rate": 9.971208684802686e-06, "loss": 1.0234, "step": 125 },
    { "epoch": 0.14, "grad_norm": 1.1119539198620567, "learning_rate": 9.968134760521114e-06, "loss": 0.9956, "step": 130 },
    { "epoch": 0.14, "grad_norm": 1.045532676163788, "learning_rate": 9.964905480067585e-06, "loss": 1.0103, "step": 135 },
    { "epoch": 0.15, "grad_norm": 1.0818099938062198, "learning_rate": 9.96152094442339e-06, "loss": 0.987, "step": 140 },
    { "epoch": 0.15, "grad_norm": 1.06916616510137, "learning_rate": 9.957981259424724e-06, "loss": 1.0189, "step": 145 },
    { "epoch": 0.16, "grad_norm": 1.1000812098052206, "learning_rate": 9.954286535759394e-06, "loss": 1.0025, "step": 150 },
    { "epoch": 0.16, "grad_norm": 1.0740685860653156, "learning_rate": 9.950436888963337e-06, "loss": 1.0394, "step": 155 },
    { "epoch": 0.17, "grad_norm": 1.0578416601226404, "learning_rate": 9.946432439417021e-06, "loss": 1.0419, "step": 160 },
    { "epoch": 0.17, "grad_norm": 1.1378367934770748, "learning_rate": 9.942273312341679e-06, "loss": 1.04, "step": 165 },
    { "epoch": 0.18, "grad_norm": 1.106141894903122, "learning_rate": 9.937959637795389e-06, "loss": 1.0112, "step": 170 },
    { "epoch": 0.18, "grad_norm": 1.0459501547982482, "learning_rate": 9.93349155066901e-06, "loss": 0.9959, "step": 175 },
    { "epoch": 0.19, "grad_norm": 1.1420602608538855, "learning_rate": 9.928869190681964e-06, "loss": 0.9952, "step": 180 },
    { "epoch": 0.2, "grad_norm": 1.0748374838181862, "learning_rate": 9.924092702377863e-06, "loss": 1.0094, "step": 185 },
    { "epoch": 0.2, "grad_norm": 1.0535011085546289, "learning_rate": 9.919162235119996e-06, "loss": 1.0054, "step": 190 },
    { "epoch": 0.21, "grad_norm": 1.0310625793824704, "learning_rate": 9.91407794308665e-06, "loss": 1.0117, "step": 195 },
    { "epoch": 0.21, "grad_norm": 1.0359842004906923, "learning_rate": 9.908839985266297e-06, "loss": 0.9982, "step": 200 },
    { "epoch": 0.22, "grad_norm": 1.084059570369228, "learning_rate": 9.903448525452618e-06, "loss": 1.0127, "step": 205 },
    { "epoch": 0.22, "grad_norm": 1.1227120329409497, "learning_rate": 9.89790373223938e-06, "loss": 1.048, "step": 210 },
    { "epoch": 0.23, "grad_norm": 1.0615021518173307, "learning_rate": 9.892205779015167e-06, "loss": 1.0021, "step": 215 },
    { "epoch": 0.23, "grad_norm": 1.070318683802529, "learning_rate": 9.886354843957953e-06, "loss": 1.0043, "step": 220 },
    { "epoch": 0.24, "grad_norm": 1.0419755132096296, "learning_rate": 9.88035111002954e-06, "loss": 0.9743, "step": 225 },
    { "epoch": 0.24, "grad_norm": 1.1796172322040084, "learning_rate": 9.874194764969827e-06, "loss": 0.9957, "step": 230 },
    { "epoch": 0.25, "grad_norm": 1.0933963352790785, "learning_rate": 9.867886001290943e-06, "loss": 0.9814, "step": 235 },
    { "epoch": 0.25, "grad_norm": 1.1066280030775704, "learning_rate": 9.861425016271227e-06, "loss": 0.9832, "step": 240 },
    { "epoch": 0.26, "grad_norm": 1.1138948008724274, "learning_rate": 9.854812011949059e-06, "loss": 0.9871, "step": 245 },
    { "epoch": 0.26, "grad_norm": 1.0644401239508805, "learning_rate": 9.848047195116543e-06, "loss": 0.9951, "step": 250 },
    { "epoch": 0.27, "grad_norm": 1.1880183474724784, "learning_rate": 9.841130777313039e-06, "loss": 0.9902, "step": 255 },
    { "epoch": 0.27, "grad_norm": 1.0747113009717828, "learning_rate": 9.834062974818547e-06, "loss": 0.9433, "step": 260 },
    { "epoch": 0.28, "grad_norm": 1.1442114734348945, "learning_rate": 9.826844008646949e-06, "loss": 0.9703, "step": 265 },
    { "epoch": 0.29, "grad_norm": 1.0895758630826766, "learning_rate": 9.81947410453909e-06, "loss": 1.0236, "step": 270 },
    { "epoch": 0.29, "grad_norm": 0.996075250542336, "learning_rate": 9.811953492955728e-06, "loss": 0.9577, "step": 275 },
    { "epoch": 0.3, "grad_norm": 1.1734623195649692, "learning_rate": 9.80428240907032e-06, "loss": 0.9752, "step": 280 },
    { "epoch": 0.3, "grad_norm": 1.282701051609298, "learning_rate": 9.796461092761668e-06, "loss": 0.987, "step": 285 },
    { "epoch": 0.31, "grad_norm": 1.0721992980205135, "learning_rate": 9.788489788606423e-06, "loss": 0.944, "step": 290 },
    { "epoch": 0.31, "grad_norm": 1.105694230535082, "learning_rate": 9.780368745871438e-06, "loss": 0.9804, "step": 295 },
    { "epoch": 0.32, "grad_norm": 1.1121587653939105, "learning_rate": 9.772098218505963e-06, "loss": 1.0099, "step": 300 },
    { "epoch": 0.32, "grad_norm": 1.1073177873687883, "learning_rate": 9.763678465133712e-06, "loss": 0.9887, "step": 305 },
    { "epoch": 0.33, "grad_norm": 1.1986141459298305, "learning_rate": 9.755109749044781e-06, "loss": 0.9749, "step": 310 },
    { "epoch": 0.33, "grad_norm": 1.0864391212895972, "learning_rate": 9.7463923381874e-06, "loss": 0.9767, "step": 315 },
    { "epoch": 0.34, "grad_norm": 1.0595953209575595, "learning_rate": 9.737526505159564e-06, "loss": 0.9297, "step": 320 },
    { "epoch": 0.34, "grad_norm": 1.083224438455533, "learning_rate": 9.728512527200509e-06, "loss": 0.9498, "step": 325 },
    { "epoch": 0.35, "grad_norm": 1.1306776282190978, "learning_rate": 9.719350686182041e-06, "loss": 0.982, "step": 330 },
    { "epoch": 0.35, "grad_norm": 1.07939319367538, "learning_rate": 9.710041268599718e-06, "loss": 0.9669, "step": 335 },
    { "epoch": 0.36, "grad_norm": 1.1100410279851476, "learning_rate": 9.700584565563897e-06, "loss": 0.956, "step": 340 },
    { "epoch": 0.36, "grad_norm": 1.0917533373255544, "learning_rate": 9.690980872790627e-06, "loss": 0.9878, "step": 345 },
    { "epoch": 0.37, "grad_norm": 1.1287494016251205, "learning_rate": 9.681230490592403e-06, "loss": 0.9604, "step": 350 },
    { "epoch": 0.38, "grad_norm": 1.0366025693971206, "learning_rate": 9.671333723868773e-06, "loss": 0.9809, "step": 355 },
    { "epoch": 0.38, "grad_norm": 1.1876939558601538, "learning_rate": 9.66129088209681e-06, "loss": 0.9324, "step": 360 },
    { "epoch": 0.39, "grad_norm": 1.1296469706806582, "learning_rate": 9.651102279321429e-06, "loss": 0.98, "step": 365 },
    { "epoch": 0.39, "grad_norm": 1.0920615981549329, "learning_rate": 9.640768234145563e-06, "loss": 0.9474, "step": 370 },
    { "epoch": 0.4, "grad_norm": 1.045353192143218, "learning_rate": 9.630289069720213e-06, "loss": 0.9416, "step": 375 },
    { "epoch": 0.4, "grad_norm": 1.0546831730532094, "learning_rate": 9.619665113734327e-06, "loss": 0.9583, "step": 380 },
    { "epoch": 0.41, "grad_norm": 1.120397617115956, "learning_rate": 9.608896698404567e-06, "loss": 0.9739, "step": 385 },
    { "epoch": 0.41, "grad_norm": 1.0897789727469696, "learning_rate": 9.597984160464908e-06, "loss": 0.9882, "step": 390 },
    { "epoch": 0.42, "grad_norm": 1.0655227440534312, "learning_rate": 9.586927841156121e-06, "loss": 0.973, "step": 395 },
    { "epoch": 0.42, "grad_norm": 1.024445190271631, "learning_rate": 9.575728086215093e-06, "loss": 0.9488, "step": 400 },
    { "epoch": 0.43, "grad_norm": 1.0957551302719917, "learning_rate": 9.564385245864015e-06, "loss": 0.9395, "step": 405 },
    { "epoch": 0.43, "grad_norm": 1.0348921383964815, "learning_rate": 9.552899674799438e-06, "loss": 0.9618, "step": 410 },
    { "epoch": 0.44, "grad_norm": 1.1320917241343242, "learning_rate": 9.541271732181174e-06, "loss": 0.9737, "step": 415 },
    { "epoch": 0.44, "grad_norm": 1.0955620287950987, "learning_rate": 9.52950178162107e-06, "loss": 0.9765, "step": 420 },
    { "epoch": 0.45, "grad_norm": 1.0865957472837047, "learning_rate": 9.517590191171638e-06, "loss": 0.9402, "step": 425 },
    { "epoch": 0.45, "grad_norm": 1.0608004961340336, "learning_rate": 9.505537333314534e-06, "loss": 0.938, "step": 430 },
    { "epoch": 0.46, "grad_norm": 1.0436288259170787, "learning_rate": 9.493343584948931e-06, "loss": 0.9495, "step": 435 },
    { "epoch": 0.46, "grad_norm": 1.0827000850655668, "learning_rate": 9.481009327379714e-06, "loss": 0.9505, "step": 440 },
    { "epoch": 0.47, "grad_norm": 1.0958366892000795, "learning_rate": 9.46853494630557e-06, "loss": 0.9536, "step": 445 },
    { "epoch": 0.48, "grad_norm": 1.0431220913897328, "learning_rate": 9.455920831806917e-06, "loss": 0.942, "step": 450 },
    { "epoch": 0.48, "grad_norm": 1.1372655798293543, "learning_rate": 9.443167378333711e-06, "loss": 0.9447, "step": 455 },
    { "epoch": 0.49, "grad_norm": 1.0890187843066097, "learning_rate": 9.43027498469311e-06, "loss": 0.9291, "step": 460 },
    { "epoch": 0.49, "grad_norm": 1.128255566030822, "learning_rate": 9.41724405403701e-06, "loss": 0.9418, "step": 465 },
    { "epoch": 0.5, "grad_norm": 1.0200134644324146, "learning_rate": 9.404074993849421e-06, "loss": 0.927, "step": 470 },
    { "epoch": 0.5, "grad_norm": 1.0912622433950008, "learning_rate": 9.390768215933746e-06, "loss": 0.943, "step": 475 },
    { "epoch": 0.51, "grad_norm": 1.1784430852167105, "learning_rate": 9.377324136399887e-06, "loss": 0.9409, "step": 480 },
    { "epoch": 0.51, "grad_norm": 1.0732445497397998, "learning_rate": 9.36374317565124e-06, "loss": 0.9401, "step": 485 },
    { "epoch": 0.52, "grad_norm": 1.1241973380928443, "learning_rate": 9.350025758371554e-06, "loss": 0.9188, "step": 490 },
    { "epoch": 0.52, "grad_norm": 1.0680249447424572, "learning_rate": 9.336172313511636e-06, "loss": 0.9304, "step": 495 },
    { "epoch": 0.53, "grad_norm": 1.0400938648362148, "learning_rate": 9.322183274275954e-06, "loss": 0.9465, "step": 500 },
    { "epoch": 0.53, "grad_norm": 1.1484166178621282, "learning_rate": 9.308059078109078e-06, "loss": 0.9431, "step": 505 },
    { "epoch": 0.54, "grad_norm": 1.0928763685485705, "learning_rate": 9.29380016668201e-06, "loss": 0.9368, "step": 510 },
    { "epoch": 0.54, "grad_norm": 1.0470334802413224, "learning_rate": 9.279406985878367e-06, "loss": 0.9529, "step": 515 },
    { "epoch": 0.55, "grad_norm": 1.055693577627048, "learning_rate": 9.264879985780436e-06, "loss": 0.9237, "step": 520 },
    { "epoch": 0.55, "grad_norm": 1.0582407523485609, "learning_rate": 9.250219620655112e-06, "loss": 0.9455, "step": 525 },
    { "epoch": 0.56, "grad_norm": 1.0392740863841614, "learning_rate": 9.235426348939674e-06, "loss": 0.9866, "step": 530 },
    { "epoch": 0.57, "grad_norm": 1.087021743413759, "learning_rate": 9.220500633227467e-06, "loss": 0.9797, "step": 535 },
    { "epoch": 0.57, "grad_norm": 1.0905659766649087, "learning_rate": 9.205442940253426e-06, "loss": 0.9231, "step": 540 },
    { "epoch": 0.58, "grad_norm": 1.0838061353931883, "learning_rate": 9.190253740879484e-06, "loss": 0.9155, "step": 545 },
    { "epoch": 0.58, "grad_norm": 1.1721559515157844, "learning_rate": 9.174933510079847e-06, "loss": 0.9132, "step": 550 },
    { "epoch": 0.59, "grad_norm": 1.0711291424853389, "learning_rate": 9.159482726926147e-06, "loss": 0.9368, "step": 555 },
    { "epoch": 0.59, "grad_norm": 1.0906836737125443, "learning_rate": 9.14390187457245e-06, "loss": 0.9652, "step": 560 },
    { "epoch": 0.6, "grad_norm": 1.2147816750505283, "learning_rate": 9.128191440240159e-06, "loss": 0.922, "step": 565 },
    { "epoch": 0.6, "grad_norm": 1.0745698856829782, "learning_rate": 9.11235191520277e-06, "loss": 0.9267, "step": 570 },
    { "epoch": 0.61, "grad_norm": 1.1107563079565528, "learning_rate": 9.096383794770513e-06, "loss": 0.9403, "step": 575 },
    { "epoch": 0.61, "grad_norm": 1.0645734678937102, "learning_rate": 9.080287578274866e-06, "loss": 0.9149, "step": 580 },
    { "epoch": 0.62, "grad_norm": 1.1729380707889032, "learning_rate": 9.064063769052933e-06, "loss": 0.9236, "step": 585 },
    { "epoch": 0.62, "grad_norm": 1.0634029251400858, "learning_rate": 9.047712874431716e-06, "loss": 0.9264, "step": 590 },
    { "epoch": 0.63, "grad_norm": 1.185148731024843, "learning_rate": 9.031235405712239e-06, "loss": 0.9632, "step": 595 },
    { "epoch": 0.63, "grad_norm": 1.1238661801404854, "learning_rate": 9.014631878153564e-06, "loss": 0.9364, "step": 600 },
    { "epoch": 0.64, "grad_norm": 1.1101591200426506, "learning_rate": 8.997902810956682e-06, "loss": 0.9121, "step": 605 },
    { "epoch": 0.64, "grad_norm": 1.1328306862765927, "learning_rate": 8.98104872724827e-06, "loss": 0.9637, "step": 610 },
    { "epoch": 0.65, "grad_norm": 1.1182389860600772, "learning_rate": 8.964070154064343e-06, "loss": 0.9431, "step": 615 },
    { "epoch": 0.66, "grad_norm": 1.2315329373588069, "learning_rate": 8.94696762233376e-06, "loss": 0.9261, "step": 620 },
    { "epoch": 0.66, "grad_norm": 1.0785263989248792, "learning_rate": 8.92974166686163e-06, "loss": 0.9218, "step": 625 },
    { "epoch": 0.67, "grad_norm": 1.0293877329539916, "learning_rate": 8.912392826312595e-06, "loss": 0.9516, "step": 630 },
    { "epoch": 0.67, "grad_norm": 1.0797961930582287, "learning_rate": 8.894921643193966e-06, "loss": 0.94, "step": 635 },
    { "epoch": 0.68, "grad_norm": 1.0052477432214972, "learning_rate": 8.877328663838776e-06, "loss": 0.9207, "step": 640 },
    { "epoch": 0.68, "grad_norm": 1.0126272743426095, "learning_rate": 8.85961443838869e-06, "loss": 0.9292, "step": 645 },
    { "epoch": 0.69, "grad_norm": 1.0166858946265631, "learning_rate": 8.841779520776803e-06, "loss": 0.9171, "step": 650 },
    { "epoch": 0.69, "grad_norm": 1.0674058891203713, "learning_rate": 8.823824468710312e-06, "loss": 0.9238, "step": 655 },
    { "epoch": 0.7, "grad_norm": 1.0826543746678357, "learning_rate": 8.805749843653086e-06, "loss": 0.8903, "step": 660 },
    { "epoch": 0.7, "grad_norm": 1.0474293060948185, "learning_rate": 8.787556210808101e-06, "loss": 0.8952, "step": 665 },
    { "epoch": 0.71, "grad_norm": 1.1092322508696293, "learning_rate": 8.769244139099774e-06, "loss": 0.9191, "step": 670 },
    { "epoch": 0.71, "grad_norm": 1.0453618423472522, "learning_rate": 8.750814201156157e-06, "loss": 0.9287, "step": 675 },
    { "epoch": 0.72, "grad_norm": 1.0150902528617922, "learning_rate": 8.732266973291053e-06, "loss": 0.9005, "step": 680 },
    { "epoch": 0.72, "grad_norm": 1.111573072134849, "learning_rate": 8.713603035485972e-06, "loss": 0.9061, "step": 685 },
    { "epoch": 0.73, "grad_norm": 1.0266552996471214, "learning_rate": 8.694822971372012e-06, "loss": 0.8981, "step": 690 },
    { "epoch": 0.73, "grad_norm": 1.026959416886306, "learning_rate": 8.675927368211599e-06, "loss": 0.9119, "step": 695 },
    { "epoch": 0.74, "grad_norm": 0.990879098356618, "learning_rate": 8.656916816880122e-06, "loss": 0.934, "step": 700 },
    { "epoch": 0.75, "grad_norm": 1.016936193517629, "learning_rate": 8.637791911847462e-06, "loss": 0.9031, "step": 705 },
    { "epoch": 0.75, "grad_norm": 1.0105346034407392, "learning_rate": 8.618553251159405e-06, "loss": 0.8918, "step": 710 },
    { "epoch": 0.76, "grad_norm": 1.0219526658502593, "learning_rate": 8.599201436418927e-06, "loss": 0.9202, "step": 715 },
    { "epoch": 0.76, "grad_norm": 1.0611008297726183, "learning_rate": 8.579737072767396e-06, "loss": 0.8956, "step": 720 },
    { "epoch": 0.77, "grad_norm": 1.0532525094762688, "learning_rate": 8.560160768865642e-06, "loss": 0.8782, "step": 725 },
    { "epoch": 0.77, "grad_norm": 1.0472370063073, "learning_rate": 8.540473136874926e-06, "loss": 0.9215, "step": 730 },
    { "epoch": 0.78, "grad_norm": 1.0503901600633805, "learning_rate": 8.520674792437793e-06, "loss": 0.905, "step": 735 },
    { "epoch": 0.78, "grad_norm": 1.0699401745712223, "learning_rate": 8.50076635465883e-06, "loss": 0.8914, "step": 740 },
    { "epoch": 0.79, "grad_norm": 1.1604934245734189, "learning_rate": 8.480748446085293e-06, "loss": 0.923, "step": 745 },
    { "epoch": 0.79, "grad_norm": 1.0575469862405844, "learning_rate": 8.460621692687656e-06, "loss": 0.91, "step": 750 },
    { "epoch": 0.8, "grad_norm": 1.1861862918344839, "learning_rate": 8.44038672384002e-06, "loss": 0.9183, "step": 755 },
    { "epoch": 0.8, "grad_norm": 1.0866238920331526, "learning_rate": 8.420044172300443e-06, "loss": 0.9012, "step": 760 },
    { "epoch": 0.81, "grad_norm": 1.0963030089254635, "learning_rate": 8.399594674191147e-06, "loss": 0.8867, "step": 765 },
    { "epoch": 0.81, "grad_norm": 1.0516263694748806, "learning_rate": 8.379038868978635e-06, "loss": 0.9204, "step": 770 },
    { "epoch": 0.82, "grad_norm": 1.0602404388082067, "learning_rate": 8.358377399453684e-06, "loss": 0.8975, "step": 775 },
    { "epoch": 0.82, "grad_norm": 1.0524212623827451, "learning_rate": 8.337610911711248e-06, "loss": 0.9182, "step": 780 },
    { "epoch": 0.83, "grad_norm": 1.0486851629524967, "learning_rate": 8.316740055130263e-06, "loss": 0.8996, "step": 785 },
    { "epoch": 0.83, "grad_norm": 1.0382393662171674, "learning_rate": 8.295765482353326e-06, "loss": 0.8898, "step": 790 },
    { "epoch": 0.84, "grad_norm": 1.0801053233779676, "learning_rate": 8.274687849266295e-06, "loss": 0.8942, "step": 795 },
    { "epoch": 0.85, "grad_norm": 1.082914632918619, "learning_rate": 8.253507814977779e-06, "loss": 0.9335, "step": 800 },
    { "epoch": 0.85, "grad_norm": 1.115797305584172, "learning_rate": 8.232226041798528e-06, "loss": 0.8733, "step": 805 },
    { "epoch": 0.86, "grad_norm": 1.0758274816242523, "learning_rate": 8.210843195220717e-06, "loss": 0.9121, "step": 810 },
    { "epoch": 0.86, "grad_norm": 0.9966437564306923, "learning_rate": 8.189359943897137e-06, "loss": 0.9126, "step": 815 },
    { "epoch": 0.87, "grad_norm": 1.1254388184304862, "learning_rate": 8.167776959620298e-06, "loss": 0.9113, "step": 820 },
    { "epoch": 0.87, "grad_norm": 1.033615919920944, "learning_rate": 8.1460949173014e-06, "loss": 0.8863, "step": 825 },
    { "epoch": 0.88, "grad_norm": 1.0126421627367477, "learning_rate": 8.124314494949247e-06, "loss": 0.9044, "step": 830 },
    { "epoch": 0.88, "grad_norm": 1.0545539629522227, "learning_rate": 8.102436373649029e-06, "loss": 0.8942, "step": 835 },
    { "epoch": 0.89, "grad_norm": 1.004956283976033, "learning_rate": 8.080461237541049e-06, "loss": 0.9255, "step": 840 },
    { "epoch": 0.89, "grad_norm": 1.0862660155528163, "learning_rate": 8.0583897737993e-06, "loss": 0.9275, "step": 845 },
    { "epoch": 0.9, "grad_norm": 1.0697124134441602, "learning_rate": 8.036222672609994e-06, "loss": 0.9161, "step": 850 },
    { "epoch": 0.9, "grad_norm": 1.0639070724236763, "learning_rate": 8.013960627149981e-06, "loss": 0.8874, "step": 855 },
    { "epoch": 0.91, "grad_norm": 1.166900094582672, "learning_rate": 7.991604333565062e-06, "loss": 0.8897, "step": 860 },
    { "epoch": 0.91, "grad_norm": 1.1335592965754175, "learning_rate": 7.969154490948225e-06, "loss": 0.8964, "step": 865 },
    { "epoch": 0.92, "grad_norm": 1.0520381511921073, "learning_rate": 7.946611801317794e-06, "loss": 0.8736, "step": 870 },
    { "epoch": 0.92, "grad_norm": 1.16753848747216, "learning_rate": 7.923976969595459e-06, "loss": 0.9112, "step": 875 },
    { "epoch": 0.93, "grad_norm": 1.0772133099773151, "learning_rate": 7.901250703584245e-06, "loss": 0.9155, "step": 880 },
    { "epoch": 0.94, "grad_norm": 1.1464686627860388, "learning_rate": 7.878433713946373e-06, "loss": 0.8962, "step": 885 },
    { "epoch": 0.94, "grad_norm": 1.0835779136854178, "learning_rate": 7.855526714181041e-06, "loss": 0.9058, "step": 890 },
    { "epoch": 0.95, "grad_norm": 1.171366478493349, "learning_rate": 7.832530420602113e-06, "loss": 0.8756, "step": 895 },
    { "epoch": 0.95, "grad_norm": 1.040168900901505, "learning_rate": 7.809445552315714e-06, "loss": 0.8594, "step": 900 },
    { "epoch": 0.96, "grad_norm": 1.02166560480321, "learning_rate": 7.786272831197745e-06, "loss": 0.8935, "step": 905 },
    { "epoch": 0.96, "grad_norm": 1.1107392454183416, "learning_rate": 7.763012981871314e-06, "loss": 0.904, "step": 910 },
    { "epoch": 0.97, "grad_norm": 0.9896358057101541, "learning_rate": 7.739666731684073e-06, "loss": 0.9068, "step": 915 },
    { "epoch": 0.97, "grad_norm": 0.9788741930391702, "learning_rate": 7.716234810685476e-06, "loss": 0.8846, "step": 920 },
    { "epoch": 0.98, "grad_norm": 0.9931045191442167, "learning_rate": 7.692717951603942e-06, "loss": 0.8584, "step": 925 },
    { "epoch": 0.98, "grad_norm": 1.0645481368236074, "learning_rate": 7.669116889823955e-06, "loss": 0.8992, "step": 930 },
    { "epoch": 0.99, "grad_norm": 0.9816731950451545, "learning_rate": 7.645432363363057e-06, "loss": 0.8851, "step": 935 },
    { "epoch": 0.99, "grad_norm": 0.9899142833993008, "learning_rate": 7.621665112848776e-06, "loss": 0.8845, "step": 940 },
    { "epoch": 1.0, "grad_norm": 1.0638888300871174, "learning_rate": 7.597815881495465e-06, "loss": 0.8773, "step": 945 },
    { "epoch": 1.0, "grad_norm": 1.031662431521578, "learning_rate": 7.573885415081059e-06, "loss": 0.8258, "step": 950 },
    { "epoch": 1.01, "grad_norm": 1.040426497974828, "learning_rate": 7.54987446192376e-06, "loss": 0.7907, "step": 955 },
    { "epoch": 1.01, "grad_norm": 0.9887566903005512, "learning_rate": 7.525783772858624e-06, "loss": 0.8091, "step": 960 },
    { "epoch": 1.02, "grad_norm": 1.0542179478307365, "learning_rate": 7.5016141012141e-06, "loss": 0.7815, "step": 965 },
    { "epoch": 1.03, "grad_norm": 1.0738731959256824, "learning_rate": 7.477366202788456e-06, "loss": 0.7734, "step": 970 },
    { "epoch": 1.03, "grad_norm": 0.9975806760235982, "learning_rate": 7.45304083582616e-06, "loss": 0.7824, "step": 975 },
    { "epoch": 1.04, "grad_norm": 1.005274019925314, "learning_rate": 7.4286387609941544e-06, "loss": 0.769, "step": 980 },
    { "epoch": 1.04, "grad_norm": 1.0937329481520819, "learning_rate": 7.40416074135808e-06, "loss": 0.791, "step": 985 },
    { "epoch": 1.05, "grad_norm": 0.9987999174071854, "learning_rate": 7.379607542358414e-06, "loss": 0.7983, "step": 990 },
    { "epoch": 1.05, "grad_norm": 1.074721973505265, "learning_rate": 7.3549799317865235e-06, "loss": 0.8264, "step": 995 },
    { "epoch": 1.06, "grad_norm": 1.0023766389640552, "learning_rate": 7.330278679760673e-06, "loss": 0.8166, "step": 1000 },
    { "epoch": 1.06, "grad_norm": 1.0263488491446793, "learning_rate": 7.3055045587019315e-06, "loss": 0.7756, "step": 1005 },
    { "epoch": 1.07, "grad_norm": 1.222252310199244, "learning_rate": 7.280658343310016e-06, "loss": 0.8113, "step": 1010 },
    { "epoch": 1.07, "grad_norm": 1.0803171037496995, "learning_rate": 7.255740810539078e-06, "loss": 0.7773, "step": 1015 },
    { "epoch": 1.08, "grad_norm": 1.0429385720996782, "learning_rate": 7.230752739573398e-06, "loss": 0.7959, "step": 1020 },
    { "epoch": 1.08, "grad_norm": 1.0525788357504489, "learning_rate": 7.205694911803019e-06, "loss": 0.7962, "step": 1025 },
    { "epoch": 1.09, "grad_norm": 0.986228023483833, "learning_rate": 7.18056811079932e-06, "loss": 0.79, "step": 1030 },
    { "epoch": 1.09, "grad_norm": 1.031179895714868, "learning_rate": 7.155373122290508e-06, "loss": 0.8101, "step": 1035 },
    { "epoch": 1.1, "grad_norm": 1.0379629517770603, "learning_rate": 7.13011073413705e-06, "loss": 0.781, "step": 1040 },
    { "epoch": 1.1, "grad_norm": 1.033153108919124, "learning_rate": 7.1047817363070325e-06, "loss": 0.8418, "step": 1045 },
    { "epoch": 1.11, "grad_norm": 1.0357203376239867, "learning_rate": 7.079386920851466e-06, "loss": 0.8065, "step": 1050 },
    { "epoch": 1.11, "grad_norm": 1.0540192082846203, "learning_rate": 7.053927081879505e-06, "loss": 0.7956, "step": 1055 },
    { "epoch": 1.12, "grad_norm": 1.0552828635725824, "learning_rate": 7.0284030155336315e-06, "loss": 0.7945, "step": 1060 },
    { "epoch": 1.13, "grad_norm": 0.9810627289945896, "learning_rate": 7.002815519964745e-06, "loss": 0.7965, "step": 1065 },
    { "epoch": 1.13, "grad_norm": 1.0916102744452092, "learning_rate": 6.977165395307215e-06, "loss": 0.7991, "step": 1070 },
    { "epoch": 1.14, "grad_norm": 1.1543690326062077, "learning_rate": 6.951453443653852e-06, "loss": 0.7896, "step": 1075 },
    { "epoch": 1.14, "grad_norm": 1.1170103600405488, "learning_rate": 6.9256804690308276e-06, "loss": 0.7828, "step": 1080 },
    { "epoch": 1.15, "grad_norm": 1.0526733296614392, "learning_rate": 6.899847277372538e-06, "loss": 0.7923, "step": 1085 },
    { "epoch": 1.15, "grad_norm": 1.0770254342023697, "learning_rate": 6.873954676496395e-06, "loss": 0.8128, "step": 1090 },
    { "epoch": 1.16, "grad_norm": 1.037705594081886, "learning_rate": 6.848003476077567e-06, "loss": 0.7856, "step": 1095 },
    { "epoch": 1.16, "grad_norm": 1.0319807068181204, "learning_rate": 6.8219944876236645e-06, "loss": 0.7949, "step": 1100 },
    { "epoch": 1.17, "grad_norm": 1.0927555007584646, "learning_rate": 6.795928524449354e-06, "loss": 0.7941, "step": 1105 },
    { "epoch": 1.17, "grad_norm": 0.9869897993273156, "learning_rate": 6.769806401650936e-06, "loss": 0.7667, "step": 1110 },
    { "epoch": 1.18, "grad_norm": 1.0055956062759406, "learning_rate": 6.743628936080852e-06, "loss": 0.7855, "step": 1115 },
    { "epoch": 1.18, "grad_norm": 1.0283367881989096, "learning_rate": 6.717396946322137e-06, "loss": 0.7745, "step": 1120 },
    { "epoch": 1.19, "grad_norm": 1.0345829389670045, "learning_rate": 6.6911112526628295e-06, "loss": 0.7842, "step": 1125 },
    { "epoch": 1.19, "grad_norm": 1.0711135328845822, "learning_rate": 6.664772677070316e-06, "loss": 0.7558, "step": 1130 },
    { "epoch": 1.2, "grad_norm": 0.9877769296594265, "learning_rate": 6.638382043165628e-06, "loss": 0.7788, "step": 1135 },
    { "epoch": 1.2, "grad_norm": 1.131836138091609, "learning_rate": 6.611940176197688e-06, "loss": 0.7901, "step": 1140 },
    { "epoch": 1.21, "grad_norm": 1.058249641590972, "learning_rate": 6.585447903017506e-06, "loss": 0.7936, "step": 1145 },
    { "epoch": 1.22, "grad_norm": 1.073971008814511, "learning_rate": 6.558906052052314e-06, "loss": 0.7835, "step": 1150 },
    { "epoch": 1.22, "grad_norm": 1.0491301969369466, "learning_rate": 6.532315453279673e-06, "loss": 0.7902, "step": 1155 },
    { "epoch": 1.23, "grad_norm": 1.046297097483487, "learning_rate": 6.505676938201512e-06, "loss": 0.7767, "step": 1160 },
    { "epoch": 1.23, "grad_norm": 1.046022517875942, "learning_rate": 6.478991339818128e-06, "loss": 0.8091, "step": 1165 },
    { "epoch": 1.24, "grad_norm": 1.0086633248074561, "learning_rate": 6.4522594926021355e-06, "loss": 0.7797, "step": 1170 },
    { "epoch": 1.24, "grad_norm": 1.0965955454651117, "learning_rate": 6.425482232472377e-06, "loss": 0.7702, "step": 1175 },
    { "epoch": 1.25, "grad_norm": 1.0362189192150881, "learning_rate": 6.3986603967677805e-06, "loss": 0.7931, "step": 1180 },
    { "epoch": 1.25, "grad_norm": 1.110468197330772, "learning_rate": 6.371794824221173e-06, "loss": 0.7917, "step": 1185 },
    { "epoch": 1.26, "grad_norm": 1.0163659020071605, "learning_rate": 6.344886354933058e-06, "loss": 0.7886, "step": 1190 },
    { "epoch": 1.26, "grad_norm": 1.0115549227695064, "learning_rate": 6.3179358303453386e-06, "loss": 0.7511, "step": 1195 },
    { "epoch": 1.27, "grad_norm": 1.0872016119161863, "learning_rate": 6.290944093215016e-06, "loss": 0.8036, "step": 1200 },
    { "epoch": 1.27, "grad_norm": 1.0553500518484338, "learning_rate": 6.263911987587822e-06, "loss": 0.7938, "step": 1205 },
    { "epoch": 1.28, "grad_norm": 0.993815270148442, "learning_rate": 6.236840358771837e-06, "loss": 0.7788, "step": 1210 },
    { "epoch": 1.28, "grad_norm": 1.0605675582324252, "learning_rate": 6.20973005331105e-06, "loss": 0.7781, "step": 1215 },
    { "epoch": 1.29, "grad_norm": 1.0965085071552372, "learning_rate": 6.1825819189588885e-06, "loss": 0.7872, "step": 1220 },
    { "epoch": 1.29, "grad_norm": 1.040866195350916, "learning_rate": 6.155396804651714e-06, "loss": 0.7966, "step": 1225 },
    { "epoch": 1.3, "grad_norm": 1.0593376609536802, "learning_rate": 6.128175560482264e-06, "loss": 0.7832, "step": 1230 },
    { "epoch": 1.31, "grad_norm": 1.0081718313330637, "learning_rate": 6.1009190376730785e-06, "loss": 0.7772, "step": 1235 },
    { "epoch": 1.31, "grad_norm": 0.9892554397828908, "learning_rate": 6.07362808854988e-06, "loss": 0.7856, "step": 1240 },
    { "epoch": 1.32, "grad_norm": 1.0515874983049542, "learning_rate": 6.046303566514919e-06, "loss": 0.7812, "step": 1245 },
    { "epoch": 1.32, "grad_norm": 1.01738547568124, "learning_rate": 6.018946326020287e-06, "loss": 0.7824, "step": 1250 },
    { "epoch": 1.33, "grad_norm": 0.992994982201507, "learning_rate": 5.991557222541201e-06, "loss": 0.7842, "step": 1255 },
    { "epoch": 1.33, "grad_norm": 0.9928822859609259, "learning_rate": 5.964137112549251e-06, "loss": 0.7906, "step": 1260 },
    { "epoch": 1.34, "grad_norm": 1.0673862770846931, "learning_rate": 5.9366868534856115e-06, "loss": 0.7896, "step": 1265 },
    { "epoch": 1.34, "grad_norm": 1.0627251705995355, "learning_rate": 5.909207303734241e-06, "loss": 0.7965, "step": 1270 },
    { "epoch": 1.35, "grad_norm": 1.0050051635503012, "learning_rate": 5.881699322595031e-06, "loss": 0.7775, "step": 1275 },
    { "epoch": 1.35, "grad_norm": 1.0049258262531797, "learning_rate": 5.854163770256934e-06, "loss": 0.7659, "step": 1280 },
    { "epoch": 1.36, "grad_norm": 1.1097225296353777, "learning_rate": 5.826601507771073e-06, "loss": 0.7699, "step": 1285 },
    { "epoch": 1.36, "grad_norm": 1.0610730723756006, "learning_rate": 5.799013397023806e-06, "loss": 0.7996, "step": 1290 },
    { "epoch": 1.37, "grad_norm": 1.0285633823079718, "learning_rate": 5.771400300709785e-06, "loss": 0.7829, "step": 1295 },
    { "epoch": 1.37, "grad_norm": 1.0484599021027985, "learning_rate": 5.743763082304973e-06, "loss": 0.7619, "step": 1300 },
    { "epoch": 1.38, "grad_norm": 1.0137701786577156, "learning_rate": 5.7161026060396375e-06, "loss": 0.798, "step": 1305 },
    { "epoch": 1.38, "grad_norm": 1.0289414598602742, "learning_rate": 5.688419736871341e-06, "loss": 0.7827, "step": 1310 },
    { "epoch": 1.39, "grad_norm": 1.058376335913828, "learning_rate": 5.660715340457874e-06, "loss": 0.7921, "step": 1315 },
    { "epoch": 1.39, "grad_norm": 1.0011219088912342, "learning_rate": 5.632990283130204e-06, "loss": 0.781, "step": 1320 },
    { "epoch": 1.4, "grad_norm": 0.984264955084216, "learning_rate": 5.605245431865368e-06, "loss": 0.7772, "step": 1325 },
    { "epoch": 1.41, "grad_norm": 1.0151072044919451, "learning_rate": 5.577481654259377e-06, "loss": 0.7735, "step": 1330 },
    { "epoch": 1.41, "grad_norm": 1.063533843295668, "learning_rate": 5.549699818500074e-06, "loss": 0.7682, "step": 1335 },
    { "epoch": 1.42, "grad_norm": 1.0434635789190496, "learning_rate": 5.521900793339989e-06, "loss": 0.7915, "step": 1340 },
    { "epoch": 1.42, "grad_norm": 1.0587561050751115, "learning_rate": 5.494085448069181e-06, "loss": 0.7997, "step": 1345 },
    { "epoch": 1.43, "grad_norm": 1.0758864296233028, "learning_rate": 5.466254652488036e-06, "loss": 0.7964, "step": 1350 },
    { "epoch": 1.43, "grad_norm": 1.0556751372323996, "learning_rate": 5.438409276880089e-06, "loss": 0.8062, "step": 1355 },
    { "epoch": 1.44, "grad_norm": 0.9792859835280993, "learning_rate": 5.410550191984798e-06, "loss": 0.787, "step": 1360 },
    { "epoch": 1.44, "grad_norm": 1.0231438624972786, "learning_rate": 5.3826782689703115e-06, "loss": 0.7803, "step": 1365 },
    { "epoch": 1.45, "grad_norm": 1.0660534726358564, "learning_rate": 5.354794379406242e-06, "loss": 0.78, "step": 1370 },
    { "epoch": 1.45, "grad_norm": 0.9527414539128428, "learning_rate": 5.3268993952363936e-06, "loss": 0.796, "step": 1375 },
    { "epoch": 1.46, "grad_norm": 0.9870931434726852, "learning_rate": 5.29899418875151e-06, "loss": 0.7652, "step": 1380 },
    { "epoch": 1.46, "grad_norm": 1.0537299945885146, "learning_rate": 5.271079632561992e-06, "loss": 0.7854, "step": 1385 },
    { "epoch": 1.47, "grad_norm": 1.1396368040574916, "learning_rate": 5.243156599570606e-06, "loss": 0.7617, "step": 1390 },
    { "epoch": 1.47, "grad_norm": 1.0924704024745873, "learning_rate": 5.2152259629451986e-06, "loss": 0.7713, "step": 1395 },
    { "epoch": 1.48, "grad_norm": 1.021493417245078, "learning_rate": 5.18728859609138e-06, "loss": 0.7609, "step": 1400 },
    { "epoch": 1.48, "grad_norm": 1.0148194958691719, "learning_rate": 5.159345372625223e-06, "loss": 0.7788, "step": 1405 },
    { "epoch": 1.49, "grad_norm": 1.0402765811164951, "learning_rate": 5.131397166345938e-06, "loss": 0.7599, "step": 1410 },
    { "epoch": 1.5, "grad_norm": 0.9966250584272072, "learning_rate": 5.103444851208549e-06, "loss": 0.7874, "step": 1415 },
    { "epoch": 1.5, "grad_norm": 0.9871275158697829, "learning_rate": 5.075489301296567e-06, "loss": 0.7566, "step": 1420 },
    { "epoch": 1.51, "grad_norm": 1.0896451679213162, "learning_rate": 5.047531390794661e-06, "loss": 0.7699, "step": 1425 },
    { "epoch": 1.51, "grad_norm": 1.1203863877988638, "learning_rate": 5.019571993961307e-06, "loss": 0.8088, "step": 1430 },
    { "epoch": 1.52, "grad_norm": 1.03311513179617, "learning_rate": 4.9916119851014664e-06, "loss": 0.7739, "step": 1435 },
    { "epoch": 1.52, "grad_norm": 1.0389351009988612, "learning_rate": 4.96365223853924e-06, "loss": 0.7816, "step": 1440 },
    { "epoch": 1.53, "grad_norm": 0.9960641498632878, "learning_rate": 4.93569362859052e-06, "loss": 0.775, "step": 1445 },
    { "epoch": 1.53, "grad_norm": 0.9388823495229471, "learning_rate": 4.907737029535664e-06, "loss": 0.756, "step": 1450 },
    { "epoch": 1.54, "grad_norm": 1.0662538022442485, "learning_rate": 4.8797833155921396e-06, "loss": 0.7992, "step": 1455 },
    { "epoch": 1.54, "grad_norm": 1.0350212904727674, "learning_rate": 4.8518333608872015e-06, "loss": 0.7595, "step": 1460 },
    { "epoch": 1.55, "grad_norm": 0.9967538128228846, "learning_rate": 4.823888039430551e-06, "loss": 0.7582, "step": 1465 },
    { "epoch": 1.55, "grad_norm": 1.0139079612075497, "learning_rate": 4.795948225087001e-06, "loss": 0.7709, "step": 1470 },
    { "epoch": 1.56, "grad_norm": 1.0510044388149635, "learning_rate": 4.7680147915491585e-06, "loss": 0.7692, "step": 1475 },
    { "epoch": 1.56, "grad_norm": 1.0641353890612333, "learning_rate": 4.740088612310096e-06, "loss": 0.7847, "step": 1480 },
    { "epoch": 1.57, "grad_norm": 1.0192435995305715, "learning_rate": 4.7121705606360424e-06, "loss": 0.7732, "step": 1485 },
    { "epoch": 1.57, "grad_norm": 1.0076325415256413, "learning_rate": 4.684261509539072e-06, "loss": 0.7701, "step": 1490 },
    { "epoch": 1.58, "grad_norm": 0.9707102286396411, "learning_rate": 4.65636233174981e-06, "loss": 0.77, "step": 1495 },
    { "epoch": 1.59, "grad_norm": 1.0835636202474823, "learning_rate": 4.628473899690133e-06, "loss": 0.7849, "step": 1500 },
    { "epoch": 1.59, "grad_norm": 1.0157410126136626, "learning_rate": 4.600597085445894e-06, "loss": 0.784, "step": 1505 },
    { "epoch": 1.6, "grad_norm": 1.0616186913926178, "learning_rate": 4.572732760739653e-06, "loss": 0.7785, "step": 1510 },
    { "epoch": 1.6, "grad_norm": 1.006516145178769, "learning_rate": 4.5448817969034165e-06, "loss": 0.7753, "step": 1515 },
    { "epoch": 1.61, "grad_norm": 1.0480529823653495, "learning_rate": 4.517045064851386e-06, "loss": 0.7989, "step": 1520 },
    { "epoch": 1.61, "grad_norm": 1.0432567441250045, "learning_rate": 4.489223435052732e-06, "loss": 0.7946, "step": 1525 },
    { "epoch": 1.62, "grad_norm": 1.0461342178531015, "learning_rate": 4.461417777504363e-06, "loss": 0.7676, "step": 1530 },
    { "epoch": 1.62, "grad_norm": 1.0045382622138492, "learning_rate": 4.433628961703733e-06, "loss": 0.7651, "step": 1535 },
    { "epoch": 1.63, "grad_norm": 0.9890094489435823, "learning_rate": 4.405857856621644e-06, "loss": 0.7943, "step": 1540 },
    { "epoch": 1.63, "grad_norm": 1.0127639919495397, "learning_rate": 4.378105330675074e-06, "loss": 0.7895, "step": 1545 },
    { "epoch": 1.64, "grad_norm": 1.0398544121817734, "learning_rate": 4.350372251700025e-06, "loss": 0.8004, "step": 1550 },
    { "epoch": 1.64, "grad_norm": 1.037857459368961, "learning_rate": 4.322659486924373e-06, "loss": 0.7963, "step": 1555 },
    { "epoch": 1.65, "grad_norm": 1.106103919813531, "learning_rate": 4.294967902940768e-06, "loss": 0.787, "step": 1560 },
    { "epoch": 1.65, "grad_norm": 1.0865617469424886, "learning_rate": 4.267298365679522e-06, "loss": 0.788, "step": 1565 },
    { "epoch": 1.66, "grad_norm": 1.0303226290700802, "learning_rate": 4.239651740381534e-06, "loss": 0.7642, "step": 1570 },
    { "epoch": 1.66, "grad_norm": 1.0512505166055992, "learning_rate": 4.212028891571237e-06, "loss": 0.7832, "step": 1575 },
    { "epoch": 1.67, "grad_norm": 1.0750316874597787, "learning_rate": 4.184430683029552e-06, "loss": 0.7599, "step": 1580 },
    { "epoch": 1.68, "grad_norm": 1.0622608820174235, "learning_rate": 4.156857977766896e-06, "loss": 0.7841, "step": 1585 },
    { "epoch": 1.68, "grad_norm": 1.0023528643121005, "learning_rate": 4.129311637996182e-06, "loss": 0.7845, "step": 1590 },
    { "epoch": 1.69, "grad_norm": 1.0597451506484419, "learning_rate": 4.101792525105857e-06, "loss": 0.7802, "step": 1595 },
    { "epoch": 1.69, "grad_norm": 0.9622973096022323, "learning_rate": 4.0743014996329764e-06, "loss": 0.7678, "step": 1600 },
    { "epoch": 1.7, "grad_norm": 1.051095411122212, "learning_rate": 4.046839421236276e-06, "loss": 0.7972, "step": 1605 },
    { "epoch": 1.7, "grad_norm": 1.0082128589578265, "learning_rate": 4.019407148669312e-06, "loss": 0.7948, "step": 1610 },
    { "epoch": 1.71, "grad_norm": 1.0901759578931909, "learning_rate": 3.992005539753592e-06, "loss": 0.7914, "step": 1615 },
    { "epoch": 1.71, "grad_norm": 1.0584302499373435, "learning_rate": 3.964635451351758e-06, "loss": 0.7821, "step": 1620 },
    { "epoch": 1.72, "grad_norm": 1.043189384648134, "learning_rate": 3.937297739340783e-06, "loss": 0.778, "step": 1625 },
    { "epoch": 1.72, "grad_norm": 1.0245392793145456, "learning_rate": 3.909993258585219e-06, "loss": 0.7908, "step": 1630 },
    { "epoch": 1.73, "grad_norm": 1.0082519645854728, "learning_rate": 3.882722862910458e-06, "loss": 0.7793, "step": 1635 },
    { "epoch": 1.73, "grad_norm": 1.0211341337802105, "learning_rate": 3.8554874050760345e-06, "loss": 0.8042, "step": 1640 },
    { "epoch": 1.74, "grad_norm": 0.9920127978660441, "learning_rate": 3.828287736748957e-06, "loss": 0.758, "step": 1645 },
    { "epoch": 1.74, "grad_norm": 1.0187229111502758, "learning_rate": 3.8011247084770754e-06, "loss": 0.7986, "step": 1650 },
    { "epoch": 1.75, "grad_norm": 0.9982295207578855, "learning_rate": 3.773999169662489e-06, "loss": 0.7623, "step": 1655 },
    { "epoch": 1.75, "grad_norm": 1.025180441312379, "learning_rate": 3.746911968534982e-06, "loss": 0.7454, "step": 1660 },
    { "epoch": 1.76, "grad_norm": 0.9884338430346545, "learning_rate": 3.7198639521254988e-06, "loss": 0.7671, "step": 1665 },
    { "epoch": 1.76, "grad_norm": 0.9685352318412103, "learning_rate": 3.6928559662396574e-06, "loss": 0.7583, "step": 1670 },
    { "epoch": 1.77, "grad_norm": 1.029404957630594, "learning_rate": 3.6658888554312967e-06, "loss": 0.7868, "step": 1675 },
    { "epoch": 1.78, "grad_norm": 0.9921023940146521, "learning_rate": 3.6389634629760763e-06, "loss": 0.7555, "step": 1680 },
    { "epoch": 1.78, "grad_norm": 1.017350986680598, "learning_rate": 3.612080630845096e-06, "loss": 0.7905, "step": 1685 },
    { "epoch": 1.79, "grad_norm": 1.0430603602540587, "learning_rate": 3.5852411996785776e-06, "loss": 0.7947, "step": 1690 },
    { "epoch": 1.79, "grad_norm": 0.9737056004061376, "learning_rate": 3.558446008759569e-06, "loss": 0.7789, "step": 1695 },
    { "epoch": 1.8, "grad_norm": 1.0212119960635129, "learning_rate": 3.5316958959876985e-06, "loss": 0.7671, "step": 1700 },
    { "epoch": 1.8, "grad_norm": 1.0072141418910243, "learning_rate": 3.504991697852983e-06, "loss": 0.7844, "step": 1705 },
    { "epoch": 1.81, "grad_norm": 1.059809521658242, "learning_rate": 3.4783342494096627e-06, "loss": 0.7845, "step": 1710 },
    { "epoch": 1.81, "grad_norm": 1.032182317108509, "learning_rate": 3.451724384250091e-06, "loss": 0.7792, "step": 1715 },
    { "epoch": 1.82, "grad_norm": 0.9779053888998924, "learning_rate": 3.4251629344786675e-06, "loss": 0.7591, "step": 1720 },
    { "epoch": 1.82, "grad_norm": 1.0116163318504925, "learning_rate": 3.398650730685813e-06, "loss": 0.7556, "step": 1725 },
    { "epoch": 1.83, "grad_norm": 1.0511489470052602, "learning_rate": 3.372188601922006e-06, "loss": 0.7637, "step": 1730 },
    { "epoch": 1.83, "grad_norm": 1.0172930500825146, "learning_rate": 3.3457773756718513e-06, "loss": 0.7696, "step": 1735 },
    { "epoch": 1.84, "grad_norm": 1.039493994412079, "learning_rate": 3.3194178778282046e-06, "loss": 0.7931, "step": 1740 },
    { "epoch": 1.84, "grad_norm": 1.033662637919394, "learning_rate": 3.293110932666349e-06, "loss": 0.7692, "step": 1745 },
    { "epoch": 1.85, "grad_norm": 1.0584694868797393, "learning_rate": 3.2668573628182145e-06, "loss": 0.7792, "step": 1750 },
    { "epoch": 1.85, "grad_norm": 0.994626270021195, "learning_rate": 3.2406579892466582e-06, "loss": 0.7682, "step": 1755 },
    { "epoch": 1.86, "grad_norm": 0.9270237802993908, "learning_rate": 3.2145136312197943e-06, "loss": 0.7552, "step": 1760 },
    { "epoch": 1.87, "grad_norm": 2.0595234604236357, "learning_rate": 3.18842510628537e-06, "loss": 0.7749, "step": 1765 },
    { "epoch": 1.87, "grad_norm": 1.0396319816767299, "learning_rate": 3.162393230245203e-06, "loss": 0.804, "step": 1770 },
    { "epoch": 1.88, "grad_norm": 1.0214462086054552, "learning_rate": 3.1364188171296677e-06, "loss": 0.7744, "step": 1775 },
    { "epoch": 1.88, "grad_norm": 1.0145502545771508, "learning_rate": 3.110502679172246e-06, "loss": 0.7824, "step": 1780 },
    { "epoch": 1.89, "grad_norm": 1.0196641711891408, "learning_rate": 3.084645626784124e-06, "loss": 0.7745, "step": 1785 },
    { "epoch": 1.89, "grad_norm": 1.0197064636159427, "learning_rate": 3.058848468528852e-06, "loss": 0.8031, "step": 1790 },
    { "epoch": 1.9, "grad_norm": 0.9907125667454302, "learning_rate": 3.03311201109706e-06, "loss": 0.7919, "step": 1795 },
    { "epoch": 1.9, "grad_norm": 1.017942513059757, "learning_rate": 3.0074370592812286e-06, "loss": 0.7907, "step": 1800 },
    { "epoch": 1.91, "grad_norm": 1.0821499695866912, "learning_rate": 2.9818244159505265e-06, "loss": 0.7901, "step": 1805 },
    { "epoch": 1.91, "grad_norm": 0.9934394662674368, "learning_rate": 2.956274882025706e-06, "loss": 0.7638, "step": 1810 },
    { "epoch": 1.92, "grad_norm": 1.0313411208961847, "learning_rate": 2.930789256454052e-06, "loss": 0.7553, "step": 1815 },
    { "epoch": 1.92, "grad_norm": 0.9950833531614097, "learning_rate": 2.905368336184406e-06, "loss": 0.7576, "step": 1820 },
    { "epoch": 1.93, "grad_norm": 0.9936896686220547, "learning_rate": 2.8800129161422365e-06, "loss": 0.7671, "step": 1825 },
    { "epoch": 1.93, "grad_norm": 0.9909860465997411, "learning_rate": 2.8547237892047852e-06, "loss": 0.74, "step": 1830 },
    { "epoch": 1.94, "grad_norm": 0.9788752840880554, "learning_rate": 2.8295017461762806e-06, "loss": 0.767, "step": 1835 },
    { "epoch": 1.94, "grad_norm": 0.9764110020200104, "learning_rate": 2.804347575763193e-06, "loss": 0.7668, "step": 1840 },
    { "epoch": 1.95, "grad_norm": 0.9772254707929505, "learning_rate": 2.7792620645495917e-06, "loss": 0.7425, "step": 1845 },
    { "epoch": 1.96, "grad_norm": 1.0000854462976456, "learning_rate": 2.7542459969725215e-06, "loss": 0.7466, "step": 1850 },
    { "epoch": 1.96, "grad_norm": 1.0352323998365711, "learning_rate": 2.729300155297504e-06, "loss": 0.771, "step": 1855 },
    { "epoch": 1.97, "grad_norm": 0.9811051893834364, "learning_rate": 2.704425319594049e-06, "loss": 0.7778, "step": 1860 },
    { "epoch": 1.97, "grad_norm": 1.0284677234046133, "learning_rate": 2.6796222677112825e-06, "loss": 0.7796, "step": 1865 },
    { "epoch": 1.98, "grad_norm": 0.9664217044137716, "learning_rate": 2.6548917752535997e-06, "loss": 0.771, "step": 1870 },
    { "epoch": 1.98, "grad_norm": 1.0008524753186703, "learning_rate": 2.6302346155564385e-06, "loss": 0.7963, "step": 1875 },
    { "epoch": 1.99, "grad_norm": 1.0088045948631796, "learning_rate": 2.6056515596620715e-06, "loss": 0.7571, "step": 1880 },
    { "epoch": 1.99, "grad_norm": 0.9727997698934588, "learning_rate": 2.581143376295516e-06, "loss": 0.7968, "step": 1885 },
    { "epoch": 2.0, "grad_norm": 0.9760428822299934, "learning_rate": 2.556710831840481e-06, "loss": 0.7829, "step": 1890 },
    { "epoch": 2.0, "grad_norm": 1.1893585643467264, "learning_rate": 2.5323546903154074e-06, "loss": 0.7363, "step": 1895 },
{
"epoch": 2.01,
"grad_norm": 1.0408498899558132,
"learning_rate": 2.508075713349575e-06,
"loss": 0.683,
"step": 1900
},
{
"epoch": 2.01,
"grad_norm": 1.0852218097728863,
"learning_rate": 2.483874660159294e-06,
"loss": 0.6388,
"step": 1905
},
{
"epoch": 2.02,
"grad_norm": 1.0636193658435114,
"learning_rate": 2.45975228752415e-06,
"loss": 0.6785,
"step": 1910
},
{
"epoch": 2.02,
"grad_norm": 1.05164052954354,
"learning_rate": 2.435709349763354e-06,
"loss": 0.7024,
"step": 1915
},
{
"epoch": 2.03,
"grad_norm": 1.0744751292672923,
"learning_rate": 2.4117465987121357e-06,
"loss": 0.6714,
"step": 1920
},
{
"epoch": 2.03,
"grad_norm": 1.0221167769747221,
"learning_rate": 2.387864783698258e-06,
"loss": 0.6441,
"step": 1925
},
{
"epoch": 2.04,
"grad_norm": 1.0453109653021675,
"learning_rate": 2.3640646515185596e-06,
"loss": 0.6668,
"step": 1930
},
{
"epoch": 2.04,
"grad_norm": 1.0035196656143317,
"learning_rate": 2.3403469464156235e-06,
"loss": 0.6711,
"step": 1935
},
{
"epoch": 2.05,
"grad_norm": 1.0614923887712562,
"learning_rate": 2.31671241005449e-06,
"loss": 0.6801,
"step": 1940
},
{
"epoch": 2.06,
"grad_norm": 1.0457688195463548,
"learning_rate": 2.2931617814994704e-06,
"loss": 0.6676,
"step": 1945
},
{
"epoch": 2.06,
"grad_norm": 1.094973586743587,
"learning_rate": 2.269695797191032e-06,
"loss": 0.6467,
"step": 1950
},
{
"epoch": 2.07,
"grad_norm": 1.0312304548353073,
"learning_rate": 2.2463151909227804e-06,
"loss": 0.6626,
"step": 1955
},
{
"epoch": 2.07,
"grad_norm": 1.0435526510546405,
"learning_rate": 2.223020693818495e-06,
"loss": 0.6565,
"step": 1960
},
{
"epoch": 2.08,
"grad_norm": 1.0361388218534178,
"learning_rate": 2.1998130343092866e-06,
"loss": 0.655,
"step": 1965
},
{
"epoch": 2.08,
"grad_norm": 1.071971382261616,
"learning_rate": 2.176692938110801e-06,
"loss": 0.6628,
"step": 1970
},
{
"epoch": 2.09,
"grad_norm": 1.0449189624346316,
"learning_rate": 2.1536611282005374e-06,
"loss": 0.6742,
"step": 1975
},
{
"epoch": 2.09,
"grad_norm": 1.0076278447431801,
"learning_rate": 2.130718324795234e-06,
"loss": 0.6615,
"step": 1980
},
{
"epoch": 2.1,
"grad_norm": 1.044357139317297,
"learning_rate": 2.107865245328354e-06,
"loss": 0.6707,
"step": 1985
},
{
"epoch": 2.1,
"grad_norm": 1.0155250644507565,
"learning_rate": 2.0851026044276405e-06,
"loss": 0.6701,
"step": 1990
},
{
"epoch": 2.11,
"grad_norm": 1.012020172763002,
"learning_rate": 2.0624311138927795e-06,
"loss": 0.6531,
"step": 1995
},
{
"epoch": 2.11,
"grad_norm": 1.0209851165233697,
"learning_rate": 2.0398514826731326e-06,
"loss": 0.6685,
"step": 2000
},
{
"epoch": 2.12,
"grad_norm": 1.0147123852944229,
"learning_rate": 2.017364416845579e-06,
"loss": 0.6506,
"step": 2005
},
{
"epoch": 2.12,
"grad_norm": 1.06994559921509,
"learning_rate": 1.9949706195924235e-06,
"loss": 0.6743,
"step": 2010
},
{
"epoch": 2.13,
"grad_norm": 0.9930487524595831,
"learning_rate": 1.97267079117942e-06,
"loss": 0.6596,
"step": 2015
},
{
"epoch": 2.13,
"grad_norm": 1.0334858708046972,
"learning_rate": 1.950465628933863e-06,
"loss": 0.6679,
"step": 2020
},
{
"epoch": 2.14,
"grad_norm": 1.060064879245556,
"learning_rate": 1.9283558272227866e-06,
"loss": 0.6749,
"step": 2025
},
{
"epoch": 2.15,
"grad_norm": 1.0171368650427,
"learning_rate": 1.9063420774312509e-06,
"loss": 0.6703,
"step": 2030
},
{
"epoch": 2.15,
"grad_norm": 0.9646165360014197,
"learning_rate": 1.8844250679407272e-06,
"loss": 0.6878,
"step": 2035
},
{
"epoch": 2.16,
"grad_norm": 1.0209055430674492,
"learning_rate": 1.862605484107562e-06,
"loss": 0.7052,
"step": 2040
},
{
"epoch": 2.16,
"grad_norm": 1.0216869737250995,
"learning_rate": 1.840884008241549e-06,
"loss": 0.6778,
"step": 2045
},
{
"epoch": 2.17,
"grad_norm": 0.990030094537176,
"learning_rate": 1.819261319584602e-06,
"loss": 0.675,
"step": 2050
},
{
"epoch": 2.17,
"grad_norm": 0.9972968188321764,
"learning_rate": 1.7977380942895007e-06,
"loss": 0.6832,
"step": 2055
},
{
"epoch": 2.18,
"grad_norm": 1.002919858574642,
"learning_rate": 1.7763150053987532e-06,
"loss": 0.6669,
"step": 2060
},
{
"epoch": 2.18,
"grad_norm": 1.040641077805689,
"learning_rate": 1.7549927228235547e-06,
"loss": 0.6874,
"step": 2065
},
{
"epoch": 2.19,
"grad_norm": 1.0136593089712416,
"learning_rate": 1.7337719133228308e-06,
"loss": 0.6662,
"step": 2070
},
{
"epoch": 2.19,
"grad_norm": 1.0032381970613455,
"learning_rate": 1.7126532404823898e-06,
"loss": 0.657,
"step": 2075
},
{
"epoch": 2.2,
"grad_norm": 1.0107311218156156,
"learning_rate": 1.6916373646941774e-06,
"loss": 0.6706,
"step": 2080
},
{
"epoch": 2.2,
"grad_norm": 1.0313882769598175,
"learning_rate": 1.6707249431356188e-06,
"loss": 0.6803,
"step": 2085
},
{
"epoch": 2.21,
"grad_norm": 1.0013867402651844,
"learning_rate": 1.6499166297490716e-06,
"loss": 0.6896,
"step": 2090
},
{
"epoch": 2.21,
"grad_norm": 0.9974367112606389,
"learning_rate": 1.6292130752213747e-06,
"loss": 0.6773,
"step": 2095
},
{
"epoch": 2.22,
"grad_norm": 1.0457782650116,
"learning_rate": 1.6086149269635081e-06,
"loss": 0.668,
"step": 2100
},
{
"epoch": 2.22,
"grad_norm": 0.9930241935385495,
"learning_rate": 1.5881228290903367e-06,
"loss": 0.6508,
"step": 2105
},
{
"epoch": 2.23,
"grad_norm": 1.0059354322817335,
"learning_rate": 1.5677374224004793e-06,
"loss": 0.6529,
"step": 2110
},
{
"epoch": 2.24,
"grad_norm": 1.0338579100235163,
"learning_rate": 1.547459344356262e-06,
"loss": 0.6614,
"step": 2115
},
{
"epoch": 2.24,
"grad_norm": 1.0203126239591027,
"learning_rate": 1.5272892290637892e-06,
"loss": 0.6749,
"step": 2120
},
{
"epoch": 2.25,
"grad_norm": 0.983643586611109,
"learning_rate": 1.5072277072531127e-06,
"loss": 0.6517,
"step": 2125
},
{
"epoch": 2.25,
"grad_norm": 1.0203957676102433,
"learning_rate": 1.4872754062585126e-06,
"loss": 0.6716,
"step": 2130
},
{
"epoch": 2.26,
"grad_norm": 1.036201909144992,
"learning_rate": 1.4674329499988737e-06,
"loss": 0.6574,
"step": 2135
},
{
"epoch": 2.26,
"grad_norm": 1.0277085537623492,
"learning_rate": 1.4477009589581787e-06,
"loss": 0.6593,
"step": 2140
},
{
"epoch": 2.27,
"grad_norm": 0.9713425669443266,
"learning_rate": 1.4280800501661057e-06,
"loss": 0.6621,
"step": 2145
},
{
"epoch": 2.27,
"grad_norm": 1.028497947768737,
"learning_rate": 1.408570837178735e-06,
"loss": 0.6656,
"step": 2150
},
{
"epoch": 2.28,
"grad_norm": 1.0565632370972053,
"learning_rate": 1.3891739300593559e-06,
"loss": 0.6644,
"step": 2155
},
{
"epoch": 2.28,
"grad_norm": 1.0043346444991121,
"learning_rate": 1.369889935359402e-06,
"loss": 0.6539,
"step": 2160
},
{
"epoch": 2.29,
"grad_norm": 1.0294689299797029,
"learning_rate": 1.3507194560994657e-06,
"loss": 0.6666,
"step": 2165
},
{
"epoch": 2.29,
"grad_norm": 1.0123495429792864,
"learning_rate": 1.331663091750463e-06,
"loss": 0.6928,
"step": 2170
},
{
"epoch": 2.3,
"grad_norm": 0.9951164224382856,
"learning_rate": 1.312721438214869e-06,
"loss": 0.6501,
"step": 2175
},
{
"epoch": 2.3,
"grad_norm": 1.025832661356824,
"learning_rate": 1.293895087808098e-06,
"loss": 0.6658,
"step": 2180
},
{
"epoch": 2.31,
"grad_norm": 0.9888366700648139,
"learning_rate": 1.2751846292399705e-06,
"loss": 0.6592,
"step": 2185
},
{
"epoch": 2.31,
"grad_norm": 1.0208359350524125,
"learning_rate": 1.2565906475963102e-06,
"loss": 0.6483,
"step": 2190
},
{
"epoch": 2.32,
"grad_norm": 1.0568986951058392,
"learning_rate": 1.2381137243206455e-06,
"loss": 0.6557,
"step": 2195
},
{
"epoch": 2.32,
"grad_norm": 0.9849389521844061,
"learning_rate": 1.2197544371960317e-06,
"loss": 0.6488,
"step": 2200
},
{
"epoch": 2.33,
"grad_norm": 1.0466426799607875,
"learning_rate": 1.2015133603269753e-06,
"loss": 0.6596,
"step": 2205
},
{
"epoch": 2.34,
"grad_norm": 0.9985742048846067,
"learning_rate": 1.183391064121493e-06,
"loss": 0.6572,
"step": 2210
},
{
"epoch": 2.34,
"grad_norm": 0.9661312369342807,
"learning_rate": 1.1653881152732582e-06,
"loss": 0.6439,
"step": 2215
},
{
"epoch": 2.35,
"grad_norm": 1.0327058718249167,
"learning_rate": 1.1475050767439e-06,
"loss": 0.6811,
"step": 2220
},
{
"epoch": 2.35,
"grad_norm": 1.0365200638536969,
"learning_rate": 1.129742507745382e-06,
"loss": 0.6588,
"step": 2225
},
{
"epoch": 2.36,
"grad_norm": 0.9804079029045045,
"learning_rate": 1.1121009637225283e-06,
"loss": 0.6783,
"step": 2230
},
{
"epoch": 2.36,
"grad_norm": 1.0326866018136251,
"learning_rate": 1.0945809963356442e-06,
"loss": 0.6705,
"step": 2235
},
{
"epoch": 2.37,
"grad_norm": 1.0314679157662048,
"learning_rate": 1.0771831534432714e-06,
"loss": 0.6353,
"step": 2240
},
{
"epoch": 2.37,
"grad_norm": 0.9589889108924486,
"learning_rate": 1.0599079790850542e-06,
"loss": 0.655,
"step": 2245
},
{
"epoch": 2.38,
"grad_norm": 0.9894914192305704,
"learning_rate": 1.0427560134647308e-06,
"loss": 0.643,
"step": 2250
},
{
"epoch": 2.38,
"grad_norm": 1.0693419775513076,
"learning_rate": 1.0257277929332332e-06,
"loss": 0.6611,
"step": 2255
},
{
"epoch": 2.39,
"grad_norm": 0.9951590219864285,
"learning_rate": 1.0088238499719254e-06,
"loss": 0.6403,
"step": 2260
},
{
"epoch": 2.39,
"grad_norm": 1.0105626202971048,
"learning_rate": 9.920447131759392e-07,
"loss": 0.6707,
"step": 2265
},
{
"epoch": 2.4,
"grad_norm": 1.0186289750333066,
"learning_rate": 9.753909072376594e-07,
"loss": 0.6809,
"step": 2270
},
{
"epoch": 2.4,
"grad_norm": 1.0267980845318398,
"learning_rate": 9.58862952930304e-07,
"loss": 0.6642,
"step": 2275
},
{
"epoch": 2.41,
"grad_norm": 1.0314667402705489,
"learning_rate": 9.424613670916499e-07,
"loss": 0.6815,
"step": 2280
},
{
"epoch": 2.41,
"grad_norm": 0.9818510396592551,
"learning_rate": 9.261866626078625e-07,
"loss": 0.6579,
"step": 2285
},
{
"epoch": 2.42,
"grad_norm": 0.998040916561116,
"learning_rate": 9.100393483974612e-07,
"loss": 0.6815,
"step": 2290
},
{
"epoch": 2.43,
"grad_norm": 1.007529165875462,
"learning_rate": 8.940199293954033e-07,
"loss": 0.6609,
"step": 2295
},
{
"epoch": 2.43,
"grad_norm": 1.0489165413908048,
"learning_rate": 8.781289065373016e-07,
"loss": 0.6661,
"step": 2300
},
{
"epoch": 2.44,
"grad_norm": 1.0586483881635766,
"learning_rate": 8.623667767437483e-07,
"loss": 0.6494,
"step": 2305
},
{
"epoch": 2.44,
"grad_norm": 0.970861929985865,
"learning_rate": 8.467340329047874e-07,
"loss": 0.6403,
"step": 2310
},
{
"epoch": 2.45,
"grad_norm": 1.0315170437890622,
"learning_rate": 8.312311638644888e-07,
"loss": 0.6802,
"step": 2315
},
{
"epoch": 2.45,
"grad_norm": 1.018615901485097,
"learning_rate": 8.158586544056791e-07,
"loss": 0.6813,
"step": 2320
},
{
"epoch": 2.46,
"grad_norm": 0.9991739019084611,
"learning_rate": 8.00616985234764e-07,
"loss": 0.6757,
"step": 2325
},
{
"epoch": 2.46,
"grad_norm": 1.039226698329409,
"learning_rate": 7.855066329667121e-07,
"loss": 0.6421,
"step": 2330
},
{
"epoch": 2.47,
"grad_norm": 1.0505394427255816,
"learning_rate": 7.705280701101392e-07,
"loss": 0.6655,
"step": 2335
},
{
"epoch": 2.47,
"grad_norm": 0.9750027460632938,
"learning_rate": 7.556817650525383e-07,
"loss": 0.6526,
"step": 2340
},
{
"epoch": 2.48,
"grad_norm": 0.989246982143368,
"learning_rate": 7.409681820456315e-07,
"loss": 0.667,
"step": 2345
},
{
"epoch": 2.48,
"grad_norm": 0.9977414734019189,
"learning_rate": 7.263877811908553e-07,
"loss": 0.6647,
"step": 2350
},
{
"epoch": 2.49,
"grad_norm": 0.9875292562685886,
"learning_rate": 7.11941018424967e-07,
"loss": 0.667,
"step": 2355
},
{
"epoch": 2.49,
"grad_norm": 0.9932801930288735,
"learning_rate": 6.97628345505797e-07,
"loss": 0.6511,
"step": 2360
},
{
"epoch": 2.5,
"grad_norm": 1.0199295886729471,
"learning_rate": 6.83450209998106e-07,
"loss": 0.6556,
"step": 2365
},
{
"epoch": 2.5,
"grad_norm": 1.0279710885988984,
"learning_rate": 6.694070552596105e-07,
"loss": 0.6676,
"step": 2370
},
{
"epoch": 2.51,
"grad_norm": 1.0221845787587531,
"learning_rate": 6.554993204270993e-07,
"loss": 0.6512,
"step": 2375
},
{
"epoch": 2.52,
"grad_norm": 0.9597530531552908,
"learning_rate": 6.417274404027163e-07,
"loss": 0.6482,
"step": 2380
},
{
"epoch": 2.52,
"grad_norm": 1.0201542647464452,
"learning_rate": 6.280918458403506e-07,
"loss": 0.6623,
"step": 2385
},
{
"epoch": 2.53,
"grad_norm": 0.9818765108255797,
"learning_rate": 6.14592963132174e-07,
"loss": 0.6599,
"step": 2390
},
{
"epoch": 2.53,
"grad_norm": 1.0020031777534095,
"learning_rate": 6.012312143953075e-07,
"loss": 0.6818,
"step": 2395
},
{
"epoch": 2.54,
"grad_norm": 1.020601700800406,
"learning_rate": 5.880070174586228e-07,
"loss": 0.6794,
"step": 2400
},
{
"epoch": 2.54,
"grad_norm": 0.9781529112263975,
"learning_rate": 5.74920785849673e-07,
"loss": 0.6612,
"step": 2405
},
{
"epoch": 2.55,
"grad_norm": 1.020456830272749,
"learning_rate": 5.619729287817621e-07,
"loss": 0.6638,
"step": 2410
},
{
"epoch": 2.55,
"grad_norm": 1.0134058298180835,
"learning_rate": 5.49163851141154e-07,
"loss": 0.6468,
"step": 2415
},
{
"epoch": 2.56,
"grad_norm": 1.0051724307379968,
"learning_rate": 5.36493953474404e-07,
"loss": 0.6411,
"step": 2420
},
{
"epoch": 2.56,
"grad_norm": 0.9963926377815217,
"learning_rate": 5.239636319758356e-07,
"loss": 0.668,
"step": 2425
},
{
"epoch": 2.57,
"grad_norm": 0.9731428272925532,
"learning_rate": 5.115732784751576e-07,
"loss": 0.6444,
"step": 2430
},
{
"epoch": 2.57,
"grad_norm": 1.0185774017291327,
"learning_rate": 4.993232804252018e-07,
"loss": 0.6529,
"step": 2435
},
{
"epoch": 2.58,
"grad_norm": 1.00711656230006,
"learning_rate": 4.872140208898118e-07,
"loss": 0.6539,
"step": 2440
},
{
"epoch": 2.58,
"grad_norm": 1.0045164786035452,
"learning_rate": 4.7524587853186866e-07,
"loss": 0.6629,
"step": 2445
},
{
"epoch": 2.59,
"grad_norm": 0.9961645157673277,
"learning_rate": 4.634192276014399e-07,
"loss": 0.6738,
"step": 2450
},
{
"epoch": 2.59,
"grad_norm": 1.0214318273829783,
"learning_rate": 4.5173443792408625e-07,
"loss": 0.6552,
"step": 2455
},
{
"epoch": 2.6,
"grad_norm": 1.0163355618069994,
"learning_rate": 4.4019187488928914e-07,
"loss": 0.6638,
"step": 2460
},
{
"epoch": 2.61,
"grad_norm": 1.032574771687925,
"learning_rate": 4.2879189943903335e-07,
"loss": 0.6877,
"step": 2465
},
{
"epoch": 2.61,
"grad_norm": 0.9930486578442914,
"learning_rate": 4.1753486805651e-07,
"loss": 0.6832,
"step": 2470
},
{
"epoch": 2.62,
"grad_norm": 0.969259241462703,
"learning_rate": 4.064211327549794e-07,
"loss": 0.6738,
"step": 2475
},
{
"epoch": 2.62,
"grad_norm": 1.018380412495952,
"learning_rate": 3.95451041066755e-07,
"loss": 0.671,
"step": 2480
},
{
"epoch": 2.63,
"grad_norm": 0.9735720562840744,
"learning_rate": 3.8462493603234064e-07,
"loss": 0.6433,
"step": 2485
},
{
"epoch": 2.63,
"grad_norm": 1.023935871901339,
"learning_rate": 3.739431561897011e-07,
"loss": 0.6593,
"step": 2490
},
{
"epoch": 2.64,
"grad_norm": 0.9931869209408388,
"learning_rate": 3.634060355636798e-07,
"loss": 0.6647,
"step": 2495
},
{
"epoch": 2.64,
"grad_norm": 1.0007736035504975,
"learning_rate": 3.53013903655548e-07,
"loss": 0.6683,
"step": 2500
},
{
"epoch": 2.65,
"grad_norm": 0.9926593135266999,
"learning_rate": 3.427670854327042e-07,
"loss": 0.6668,
"step": 2505
},
{
"epoch": 2.65,
"grad_norm": 0.9870259704326787,
"learning_rate": 3.3266590131851296e-07,
"loss": 0.6583,
"step": 2510
},
{
"epoch": 2.66,
"grad_norm": 1.0298553599069395,
"learning_rate": 3.227106671822849e-07,
"loss": 0.6835,
"step": 2515
},
{
"epoch": 2.66,
"grad_norm": 0.9915918166378904,
"learning_rate": 3.1290169432939556e-07,
"loss": 0.6428,
"step": 2520
},
{
"epoch": 2.67,
"grad_norm": 1.060474012796049,
"learning_rate": 3.03239289491557e-07,
"loss": 0.6571,
"step": 2525
},
{
"epoch": 2.67,
"grad_norm": 1.0203183687136719,
"learning_rate": 2.937237548172206e-07,
"loss": 0.6511,
"step": 2530
},
{
"epoch": 2.68,
"grad_norm": 0.989507237700814,
"learning_rate": 2.8435538786213134e-07,
"loss": 0.6746,
"step": 2535
},
{
"epoch": 2.68,
"grad_norm": 0.9853274639882493,
"learning_rate": 2.7513448158002334e-07,
"loss": 0.6657,
"step": 2540
},
{
"epoch": 2.69,
"grad_norm": 0.9957797339050202,
"learning_rate": 2.66061324313458e-07,
"loss": 0.6496,
"step": 2545
},
{
"epoch": 2.69,
"grad_norm": 1.0073836211394178,
"learning_rate": 2.5713619978480653e-07,
"loss": 0.6596,
"step": 2550
},
{
"epoch": 2.7,
"grad_norm": 0.9798969178233458,
"learning_rate": 2.483593870873829e-07,
"loss": 0.654,
"step": 2555
},
{
"epoch": 2.71,
"grad_norm": 0.9936847658098146,
"learning_rate": 2.3973116067670665e-07,
"loss": 0.6457,
"step": 2560
},
{
"epoch": 2.71,
"grad_norm": 1.0224466038654803,
"learning_rate": 2.3125179036193214e-07,
"loss": 0.6572,
"step": 2565
},
{
"epoch": 2.72,
"grad_norm": 1.0378183041017084,
"learning_rate": 2.2292154129740117e-07,
"loss": 0.6554,
"step": 2570
},
{
"epoch": 2.72,
"grad_norm": 0.9787357607930246,
"learning_rate": 2.147406739743596e-07,
"loss": 0.6689,
"step": 2575
},
{
"epoch": 2.73,
"grad_norm": 1.003947207260689,
"learning_rate": 2.0670944421280646e-07,
"loss": 0.6458,
"step": 2580
},
{
"epoch": 2.73,
"grad_norm": 1.0063190015667964,
"learning_rate": 1.9882810315349554e-07,
"loss": 0.6648,
"step": 2585
},
{
"epoch": 2.74,
"grad_norm": 1.0148103533053272,
"learning_rate": 1.9109689725008317e-07,
"loss": 0.6738,
"step": 2590
},
{
"epoch": 2.74,
"grad_norm": 1.0122729219524842,
"learning_rate": 1.8351606826142176e-07,
"loss": 0.6796,
"step": 2595
},
{
"epoch": 2.75,
"grad_norm": 1.0170129872933447,
"learning_rate": 1.7608585324399684e-07,
"loss": 0.6798,
"step": 2600
},
{
"epoch": 2.75,
"grad_norm": 0.992464215850126,
"learning_rate": 1.688064845445192e-07,
"loss": 0.6695,
"step": 2605
},
{
"epoch": 2.76,
"grad_norm": 0.9778375876093532,
"learning_rate": 1.6167818979265282e-07,
"loss": 0.6563,
"step": 2610
},
{
"epoch": 2.76,
"grad_norm": 1.0165595693382412,
"learning_rate": 1.5470119189390342e-07,
"loss": 0.6709,
"step": 2615
},
{
"epoch": 2.77,
"grad_norm": 0.9846854115443192,
"learning_rate": 1.4787570902264293e-07,
"loss": 0.6468,
"step": 2620
},
{
"epoch": 2.77,
"grad_norm": 1.0226129803358943,
"learning_rate": 1.4120195461529097e-07,
"loss": 0.6699,
"step": 2625
},
{
"epoch": 2.78,
"grad_norm": 1.0082916511837874,
"learning_rate": 1.3468013736363694e-07,
"loss": 0.6516,
"step": 2630
},
{
"epoch": 2.78,
"grad_norm": 1.0086534086914538,
"learning_rate": 1.2831046120831692e-07,
"loss": 0.6483,
"step": 2635
},
{
"epoch": 2.79,
"grad_norm": 0.9957571698657345,
"learning_rate": 1.2209312533243535e-07,
"loss": 0.6632,
"step": 2640
},
{
"epoch": 2.8,
"grad_norm": 1.0298383480420663,
"learning_rate": 1.1602832415533616e-07,
"loss": 0.6645,
"step": 2645
},
{
"epoch": 2.8,
"grad_norm": 1.0188314052602203,
"learning_rate": 1.1011624732652437e-07,
"loss": 0.6752,
"step": 2650
},
{
"epoch": 2.81,
"grad_norm": 1.0019681746822835,
"learning_rate": 1.0435707971973297e-07,
"loss": 0.6573,
"step": 2655
},
{
"epoch": 2.81,
"grad_norm": 0.9926022445477827,
"learning_rate": 9.875100142714478e-08,
"loss": 0.6396,
"step": 2660
},
{
"epoch": 2.82,
"grad_norm": 0.9847567872289796,
"learning_rate": 9.329818775376088e-08,
"loss": 0.672,
"step": 2665
},
{
"epoch": 2.82,
"grad_norm": 1.0103069579844817,
"learning_rate": 8.79988092119144e-08,
"loss": 0.678,
"step": 2670
},
{
"epoch": 2.83,
"grad_norm": 1.0092463732513441,
"learning_rate": 8.285303151594537e-08,
"loss": 0.6837,
"step": 2675
},
{
"epoch": 2.83,
"grad_norm": 1.0032753352403014,
"learning_rate": 7.786101557701209e-08,
"loss": 0.6494,
"step": 2680
},
{
"epoch": 2.84,
"grad_norm": 1.0278927407365124,
"learning_rate": 7.302291749806345e-08,
"loss": 0.6597,
"step": 2685
},
{
"epoch": 2.84,
"grad_norm": 0.9985234255556347,
"learning_rate": 6.833888856895676e-08,
"loss": 0.6672,
"step": 2690
},
{
"epoch": 2.85,
"grad_norm": 1.0086435046290338,
"learning_rate": 6.380907526172597e-08,
"loss": 0.6768,
"step": 2695
},
{
"epoch": 2.85,
"grad_norm": 0.9639413787477988,
"learning_rate": 5.943361922600255e-08,
"loss": 0.6346,
"step": 2700
},
{
"epoch": 2.86,
"grad_norm": 0.9898392259409212,
"learning_rate": 5.521265728458347e-08,
"loss": 0.6655,
"step": 2705
},
{
"epoch": 2.86,
"grad_norm": 1.0000733408715612,
"learning_rate": 5.114632142915687e-08,
"loss": 0.638,
"step": 2710
},
{
"epoch": 2.87,
"grad_norm": 0.990452054352071,
"learning_rate": 4.723473881617147e-08,
"loss": 0.6583,
"step": 2715
},
{
"epoch": 2.87,
"grad_norm": 0.988717000145255,
"learning_rate": 4.347803176286025e-08,
"loss": 0.6708,
"step": 2720
},
{
"epoch": 2.88,
"grad_norm": 0.9868081897157113,
"learning_rate": 3.98763177434186e-08,
"loss": 0.6583,
"step": 2725
},
{
"epoch": 2.89,
"grad_norm": 1.001603936622736,
"learning_rate": 3.642970938532553e-08,
"loss": 0.6754,
"step": 2730
},
{
"epoch": 2.89,
"grad_norm": 1.0028854813842756,
"learning_rate": 3.313831446582816e-08,
"loss": 0.6784,
"step": 2735
},
{
"epoch": 2.9,
"grad_norm": 0.9840591494137083,
"learning_rate": 3.000223590856666e-08,
"loss": 0.6651,
"step": 2740
},
{
"epoch": 2.9,
"grad_norm": 1.0425902900408417,
"learning_rate": 2.7021571780356804e-08,
"loss": 0.6489,
"step": 2745
},
{
"epoch": 2.91,
"grad_norm": 1.0016271763738829,
"learning_rate": 2.419641528812522e-08,
"loss": 0.6501,
"step": 2750
},
{
"epoch": 2.91,
"grad_norm": 0.9875844742537229,
"learning_rate": 2.1526854775992255e-08,
"loss": 0.667,
"step": 2755
},
{
"epoch": 2.92,
"grad_norm": 0.9909068409835267,
"learning_rate": 1.901297372251143e-08,
"loss": 0.6649,
"step": 2760
},
{
"epoch": 2.92,
"grad_norm": 1.0200770120528766,
"learning_rate": 1.665485073805817e-08,
"loss": 0.6542,
"step": 2765
},
{
"epoch": 2.93,
"grad_norm": 0.9699214260408161,
"learning_rate": 1.4452559562370683e-08,
"loss": 0.6644,
"step": 2770
},
{
"epoch": 2.93,
"grad_norm": 0.9962874170809767,
"learning_rate": 1.2406169062246232e-08,
"loss": 0.6502,
"step": 2775
},
{
"epoch": 2.94,
"grad_norm": 1.0264867036759864,
"learning_rate": 1.0515743229385645e-08,
"loss": 0.6698,
"step": 2780
},
{
"epoch": 2.94,
"grad_norm": 1.0133222133442825,
"learning_rate": 8.781341178393244e-09,
"loss": 0.6723,
"step": 2785
},
{
"epoch": 2.95,
"grad_norm": 1.0159129157737807,
"learning_rate": 7.203017144927771e-09,
"loss": 0.6561,
"step": 2790
},
{
"epoch": 2.95,
"grad_norm": 0.9931795490054022,
"learning_rate": 5.780820484007632e-09,
"loss": 0.6563,
"step": 2795
},
{
"epoch": 2.96,
"grad_norm": 1.0195254872888724,
"learning_rate": 4.514795668466576e-09,
"loss": 0.6808,
"step": 2800
},
{
"epoch": 2.96,
"grad_norm": 1.0210108366337896,
"learning_rate": 3.4049822875614757e-09,
"loss": 0.6723,
"step": 2805
},
{
"epoch": 2.97,
"grad_norm": 0.9891130306027911,
"learning_rate": 2.4514150457377594e-09,
"loss": 0.6763,
"step": 2810
},
{
"epoch": 2.97,
"grad_norm": 0.9876265686294937,
"learning_rate": 1.654123761541393e-09,
"loss": 0.6652,
"step": 2815
},
{
"epoch": 2.98,
"grad_norm": 0.9719073327336301,
"learning_rate": 1.0131333666885124e-09,
"loss": 0.6793,
"step": 2820
},
{
"epoch": 2.99,
"grad_norm": 1.004648101535836,
"learning_rate": 5.284639052832718e-10,
"loss": 0.6643,
"step": 2825
},
{
"epoch": 2.99,
"grad_norm": 1.0172517540637482,
"learning_rate": 2.0013053319334341e-10,
"loss": 0.6768,
"step": 2830
},
{
"epoch": 3.0,
"grad_norm": 0.9650966122076953,
"learning_rate": 2.814351757529643e-11,
"loss": 0.6356,
"step": 2835
},
{
"epoch": 3.0,
"step": 2838,
"total_flos": 1471706245890048.0,
"train_loss": 0.8058284866381398,
"train_runtime": 31310.8966,
"train_samples_per_second": 5.802,
"train_steps_per_second": 0.091
}
],
"logging_steps": 5,
"max_steps": 2838,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1471706245890048.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}