{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9897610921501707,
  "eval_steps": 500,
  "global_step": 657,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.26715654134750366,
      "learning_rate": 4.9992855064046754e-05,
      "loss": 2.6697,
      "step": 5
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.4067687392234802,
      "learning_rate": 4.997142434019578e-05,
      "loss": 2.5369,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.44472697377204895,
      "learning_rate": 4.9935720078139045e-05,
      "loss": 2.5661,
      "step": 15
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.45230674743652344,
      "learning_rate": 4.988576268624979e-05,
      "loss": 2.4824,
      "step": 20
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.4477585554122925,
      "learning_rate": 4.982158071991725e-05,
      "loss": 2.3343,
      "step": 25
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.3630908131599426,
      "learning_rate": 4.974321086522453e-05,
      "loss": 2.4377,
      "step": 30
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.3237389028072357,
      "learning_rate": 4.9650697917979025e-05,
      "loss": 2.4114,
      "step": 35
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.3014233708381653,
      "learning_rate": 4.954409475810737e-05,
      "loss": 2.2636,
      "step": 40
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.3054388761520386,
      "learning_rate": 4.942346231942955e-05,
      "loss": 2.2758,
      "step": 45
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.2940792143344879,
      "learning_rate": 4.92888695548294e-05,
      "loss": 2.3207,
      "step": 50
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.25064757466316223,
      "learning_rate": 4.9140393396841565e-05,
      "loss": 2.2209,
      "step": 55
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.3227023780345917,
      "learning_rate": 4.89781187136772e-05,
      "loss": 2.2328,
      "step": 60
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.312673419713974,
      "learning_rate": 4.880213826071375e-05,
      "loss": 2.2737,
      "step": 65
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.26930904388427734,
      "learning_rate": 4.861255262747643e-05,
      "loss": 2.2686,
      "step": 70
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.2343609631061554,
      "learning_rate": 4.8409470180141827e-05,
      "loss": 2.2661,
      "step": 75
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.3088403642177582,
      "learning_rate": 4.8193006999596294e-05,
      "loss": 2.191,
      "step": 80
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.3336387574672699,
      "learning_rate": 4.796328681508473e-05,
      "loss": 2.2106,
      "step": 85
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.28899675607681274,
      "learning_rate": 4.7720440933487575e-05,
      "loss": 2.2347,
      "step": 90
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.30223432183265686,
      "learning_rate": 4.746460816426647e-05,
      "loss": 2.2307,
      "step": 95
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.2692021429538727,
      "learning_rate": 4.7195934740121485e-05,
      "loss": 2.1503,
      "step": 100
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.3296695053577423,
      "learning_rate": 4.6914574233405236e-05,
      "loss": 2.2145,
      "step": 105
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.28714171051979065,
      "learning_rate": 4.662068746834176e-05,
      "loss": 2.1163,
      "step": 110
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.35001716017723083,
      "learning_rate": 4.6314442429100155e-05,
      "loss": 2.1868,
      "step": 115
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.32721835374832153,
      "learning_rate": 4.599601416377575e-05,
      "loss": 2.1865,
      "step": 120
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.30708980560302734,
      "learning_rate": 4.566558468433344e-05,
      "loss": 2.2035,
      "step": 125
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.3122975528240204,
      "learning_rate": 4.532334286257064e-05,
      "loss": 2.1762,
      "step": 130
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.3531278669834137,
      "learning_rate": 4.496948432215913e-05,
      "loss": 2.2452,
      "step": 135
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.3190854489803314,
      "learning_rate": 4.460421132682751e-05,
      "loss": 2.2267,
      "step": 140
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.29605475068092346,
      "learning_rate": 4.4227732664748365e-05,
      "loss": 2.2548,
      "step": 145
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.37863361835479736,
      "learning_rate": 4.384026352919595e-05,
      "loss": 2.2053,
      "step": 150
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3235403895378113,
      "learning_rate": 4.344202539554285e-05,
      "loss": 2.193,
      "step": 155
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.3326057195663452,
      "learning_rate": 4.3033245894665814e-05,
      "loss": 2.2349,
      "step": 160
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.3827252686023712,
      "learning_rate": 4.261415868283304e-05,
      "loss": 2.1247,
      "step": 165
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.34777992963790894,
      "learning_rate": 4.218500330814753e-05,
      "loss": 2.1555,
      "step": 170
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3294101059436798,
      "learning_rate": 4.174602507362258e-05,
      "loss": 2.1771,
      "step": 175
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.3454132676124573,
      "learning_rate": 4.1297474896967814e-05,
      "loss": 2.1616,
      "step": 180
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.3628969192504883,
      "learning_rate": 4.083960916716597e-05,
      "loss": 2.1681,
      "step": 185
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.31756460666656494,
      "learning_rate": 4.0372689597922215e-05,
      "loss": 2.146,
      "step": 190
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.41087692975997925,
      "learning_rate": 3.989698307806995e-05,
      "loss": 2.2185,
      "step": 195
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.3311336636543274,
      "learning_rate": 3.941276151901853e-05,
      "loss": 2.0976,
      "step": 200
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.38921472430229187,
      "learning_rate": 3.8920301699330076e-05,
      "loss": 2.1204,
      "step": 205
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.31258201599121094,
      "learning_rate": 3.84198851065143e-05,
      "loss": 2.1678,
      "step": 210
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.4159790575504303,
      "learning_rate": 3.791179777613163e-05,
      "loss": 2.2486,
      "step": 215
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3648820221424103,
      "learning_rate": 3.739633012829682e-05,
      "loss": 2.1523,
      "step": 220
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.35387834906578064,
      "learning_rate": 3.6873776801676264e-05,
      "loss": 2.1761,
      "step": 225
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.40865159034729004,
      "learning_rate": 3.6344436485074e-05,
      "loss": 2.219,
      "step": 230
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.40083423256874084,
      "learning_rate": 3.5808611746702814e-05,
      "loss": 2.1755,
      "step": 235
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.3656306564807892,
      "learning_rate": 3.5266608861237724e-05,
      "loss": 2.1917,
      "step": 240
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.3839705288410187,
      "learning_rate": 3.471873763475099e-05,
      "loss": 2.1878,
      "step": 245
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.4037785530090332,
      "learning_rate": 3.4165311227628524e-05,
      "loss": 2.1101,
      "step": 250
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.3458710312843323,
      "learning_rate": 3.3606645975569005e-05,
      "loss": 2.1691,
      "step": 255
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.3371334373950958,
      "learning_rate": 3.304306120876807e-05,
      "loss": 2.1904,
      "step": 260
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.4284100830554962,
      "learning_rate": 3.247487906939076e-05,
      "loss": 2.1688,
      "step": 265
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.3682388365268707,
      "learning_rate": 3.1902424327436734e-05,
      "loss": 2.1691,
      "step": 270
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.32562267780303955,
      "learning_rate": 3.132602419510336e-05,
      "loss": 2.0698,
      "step": 275
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.4634737968444824,
      "learning_rate": 3.0746008139752964e-05,
      "loss": 2.2131,
      "step": 280
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.4514225423336029,
      "learning_rate": 3.0162707695590935e-05,
      "loss": 2.1414,
      "step": 285
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.4424736499786377,
      "learning_rate": 2.9576456274162488e-05,
      "loss": 2.1257,
      "step": 290
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.4680028557777405,
      "learning_rate": 2.8987588973776304e-05,
      "loss": 2.2337,
      "step": 295
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.44635656476020813,
      "learning_rate": 2.8396442387964075e-05,
      "loss": 2.2022,
      "step": 300
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.4005129039287567,
      "learning_rate": 2.7803354413085364e-05,
      "loss": 2.1944,
      "step": 305
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.3982764184474945,
      "learning_rate": 2.72086640551878e-05,
      "loss": 2.0979,
      "step": 310
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.39070507884025574,
      "learning_rate": 2.6612711236232912e-05,
      "loss": 2.1757,
      "step": 315
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.4183822572231293,
      "learning_rate": 2.601583659979851e-05,
      "loss": 2.0571,
      "step": 320
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.39799273014068604,
      "learning_rate": 2.541838131636854e-05,
      "loss": 2.1154,
      "step": 325
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.3967350125312805,
      "learning_rate": 2.4820686888321808e-05,
      "loss": 2.164,
      "step": 330
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.3555432856082916,
      "learning_rate": 2.4223094954730956e-05,
      "loss": 2.1596,
      "step": 335
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.3698050379753113,
      "learning_rate": 2.3625947096083327e-05,
      "loss": 2.0815,
      "step": 340
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.45504623651504517,
      "learning_rate": 2.3029584639035286e-05,
      "loss": 2.0997,
      "step": 345
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.4048606753349304,
      "learning_rate": 2.2434348461311684e-05,
      "loss": 2.1397,
      "step": 350
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.46529653668403625,
      "learning_rate": 2.184057879686185e-05,
      "loss": 2.1781,
      "step": 355
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.4468785226345062,
      "learning_rate": 2.1248615041383685e-05,
      "loss": 2.1155,
      "step": 360
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.49244192242622375,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 2.2141,
      "step": 365
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.45070916414260864,
      "learning_rate": 2.0071457485485463e-05,
      "loss": 2.2219,
      "step": 370
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.392012357711792,
      "learning_rate": 1.94869365422929e-05,
      "loss": 2.1288,
      "step": 375
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.3873184621334076,
      "learning_rate": 1.8905566837925264e-05,
      "loss": 2.1477,
      "step": 380
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.47440674901008606,
      "learning_rate": 1.832768068032678e-05,
      "loss": 2.0753,
      "step": 385
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.5017093420028687,
      "learning_rate": 1.7753608386264196e-05,
      "loss": 2.1999,
      "step": 390
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.3944746255874634,
      "learning_rate": 1.7183678092519385e-05,
      "loss": 2.1569,
      "step": 395
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.4446674883365631,
      "learning_rate": 1.66182155683281e-05,
      "loss": 2.1261,
      "step": 400
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.37069305777549744,
      "learning_rate": 1.6057544029171863e-05,
      "loss": 2.177,
      "step": 405
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.3994167745113373,
      "learning_rate": 1.550198395202974e-05,
      "loss": 2.2131,
      "step": 410
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.4573921859264374,
      "learning_rate": 1.4951852892195272e-05,
      "loss": 2.1517,
      "step": 415
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.5452659726142883,
      "learning_rate": 1.4407465301763534e-05,
      "loss": 2.1315,
      "step": 420
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.4585203528404236,
      "learning_rate": 1.386913234989191e-05,
      "loss": 2.1328,
      "step": 425
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.38274097442626953,
      "learning_rate": 1.3337161744937418e-05,
      "loss": 2.1218,
      "step": 430
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.40379753708839417,
      "learning_rate": 1.2811857558572168e-05,
      "loss": 2.1244,
      "step": 435
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.43701106309890747,
      "learning_rate": 1.2293520051977567e-05,
      "loss": 2.1058,
      "step": 440
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.4709857404232025,
      "learning_rate": 1.1782445504216554e-05,
      "loss": 2.2125,
      "step": 445
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.41358914971351624,
      "learning_rate": 1.1278926042882026e-05,
      "loss": 2.0835,
      "step": 450
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.3951328694820404,
      "learning_rate": 1.0783249477118156e-05,
      "loss": 2.1068,
      "step": 455
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.47784626483917236,
      "learning_rate": 1.0295699133110251e-05,
      "loss": 2.1347,
      "step": 460
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.3892216384410858,
      "learning_rate": 9.816553692136835e-06,
      "loss": 2.0526,
      "step": 465
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.4881783127784729,
      "learning_rate": 9.346087031276962e-06,
      "loss": 2.1394,
      "step": 470
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.42697590589523315,
      "learning_rate": 8.884568066863433e-06,
      "loss": 2.1722,
      "step": 475
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.410575807094574,
      "learning_rate": 8.432260600771602e-06,
      "loss": 2.2251,
      "step": 480
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.5141696333885193,
      "learning_rate": 7.98942316963158e-06,
      "loss": 2.1422,
      "step": 485
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.41447341442108154,
      "learning_rate": 7.556308897050024e-06,
      "loss": 2.1714,
      "step": 490
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.4169827401638031,
      "learning_rate": 7.133165348925977e-06,
      "loss": 2.1653,
      "step": 495
    },
    {
      "epoch": 2.28,
      "grad_norm": 0.44966238737106323,
      "learning_rate": 6.720234391943475e-06,
      "loss": 2.142,
      "step": 500
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.47274890542030334,
      "learning_rate": 6.3177520553217575e-06,
      "loss": 2.2282,
      "step": 505
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.43409237265586853,
      "learning_rate": 5.925948395902253e-06,
      "loss": 2.1244,
      "step": 510
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.4762239158153534,
      "learning_rate": 5.545047366649164e-06,
      "loss": 2.1472,
      "step": 515
    },
    {
      "epoch": 2.37,
      "grad_norm": 0.4269148111343384,
      "learning_rate": 5.175266688639177e-06,
      "loss": 2.1133,
      "step": 520
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.4269423186779022,
      "learning_rate": 4.816817726613188e-06,
      "loss": 2.0452,
      "step": 525
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.5028102993965149,
      "learning_rate": 4.469905368161287e-06,
      "loss": 2.1049,
      "step": 530
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.5188497304916382,
      "learning_rate": 4.134727906610078e-06,
      "loss": 2.122,
      "step": 535
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.5142034292221069,
      "learning_rate": 3.8114769276792278e-06,
      "loss": 2.149,
      "step": 540
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.5908805131912231,
      "learning_rate": 3.500337199972023e-06,
      "loss": 2.1086,
      "step": 545
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.38968101143836975,
      "learning_rate": 3.201486569362641e-06,
      "loss": 2.1705,
      "step": 550
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.511169970035553,
      "learning_rate": 2.9150958573402887e-06,
      "loss": 2.144,
      "step": 555
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.43135377764701843,
      "learning_rate": 2.6413287633685807e-06,
      "loss": 2.1459,
      "step": 560
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.41832235455513,
      "learning_rate": 2.380341771315711e-06,
      "loss": 2.1183,
      "step": 565
    },
    {
      "epoch": 2.59,
      "grad_norm": 0.42403730750083923,
      "learning_rate": 2.13228406000911e-06,
      "loss": 2.1391,
      "step": 570
    },
    {
      "epoch": 2.62,
      "grad_norm": 0.47705602645874023,
      "learning_rate": 1.8972974179655768e-06,
      "loss": 2.114,
      "step": 575
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.43250662088394165,
      "learning_rate": 1.6755161623456943e-06,
      "loss": 2.1369,
      "step": 580
    },
    {
      "epoch": 2.66,
      "grad_norm": 0.46314170956611633,
      "learning_rate": 1.467067062178823e-06,
      "loss": 2.0985,
      "step": 585
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.46879735589027405,
      "learning_rate": 1.2720692659025867e-06,
      "loss": 2.1169,
      "step": 590
    },
    {
      "epoch": 2.71,
      "grad_norm": 0.4447944760322571,
      "learning_rate": 1.0906342332582031e-06,
      "loss": 2.1579,
      "step": 595
    },
    {
      "epoch": 2.73,
      "grad_norm": 0.4552321135997772,
      "learning_rate": 9.228656715807249e-07,
      "loss": 2.1577,
      "step": 600
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.49497705698013306,
      "learning_rate": 7.688594765203893e-07,
      "loss": 2.0601,
      "step": 605
    },
    {
      "epoch": 2.78,
      "grad_norm": 0.4449738562107086,
      "learning_rate": 6.287036772292143e-07,
      "loss": 2.1249,
      "step": 610
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.42925629019737244,
      "learning_rate": 5.024783860439475e-07,
      "loss": 2.1224,
      "step": 615
    },
    {
      "epoch": 2.82,
      "grad_norm": 0.5330957174301147,
      "learning_rate": 3.902557526942879e-07,
      "loss": 2.1459,
      "step": 620
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.4516250491142273,
      "learning_rate": 2.9209992306245826e-07,
      "loss": 2.1075,
      "step": 625
    },
    {
      "epoch": 2.87,
      "grad_norm": 0.4317939877510071,
      "learning_rate": 2.0806700251775057e-07,
      "loss": 2.0667,
      "step": 630
    },
    {
      "epoch": 2.89,
      "grad_norm": 0.4983243942260742,
      "learning_rate": 1.3820502384698508e-07,
      "loss": 2.184,
      "step": 635
    },
    {
      "epoch": 2.91,
      "grad_norm": 0.5226026177406311,
      "learning_rate": 8.255391979921645e-08,
      "loss": 2.1755,
      "step": 640
    },
    {
      "epoch": 2.94,
      "grad_norm": 0.46822264790534973,
      "learning_rate": 4.114550026037278e-08,
      "loss": 2.0958,
      "step": 645
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.41227301955223083,
      "learning_rate": 1.4003434070902766e-08,
      "loss": 2.0401,
      "step": 650
    },
    {
      "epoch": 2.98,
      "grad_norm": 0.43563032150268555,
      "learning_rate": 1.1432354967644277e-09,
      "loss": 2.1381,
      "step": 655
    },
    {
      "epoch": 2.99,
      "step": 657,
      "total_flos": 2.2498862551007232e+17,
      "train_loss": 2.1785550988428124,
      "train_runtime": 11431.0743,
      "train_samples_per_second": 0.923,
      "train_steps_per_second": 0.057
    }
  ],
  "logging_steps": 5,
  "max_steps": 657,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 2.2498862551007232e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|