|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9993049349617714,
  "eval_steps": 500,
  "global_step": 1258,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007943600436898023,
      "grad_norm": 11.12151660776111,
      "learning_rate": 1.5873015873015874e-07,
      "loss": 1.8009,
      "step": 1
    },
    {
      "epoch": 0.003971800218449012,
      "grad_norm": 10.10420314385476,
      "learning_rate": 7.936507936507937e-07,
      "loss": 1.7719,
      "step": 5
    },
    {
      "epoch": 0.007943600436898023,
      "grad_norm": 2.7149708608696828,
      "learning_rate": 1.5873015873015873e-06,
      "loss": 1.7086,
      "step": 10
    },
    {
      "epoch": 0.011915400655347037,
      "grad_norm": 1.570269172415534,
      "learning_rate": 2.380952380952381e-06,
      "loss": 1.6525,
      "step": 15
    },
    {
      "epoch": 0.015887200873796047,
      "grad_norm": 1.166276790017584,
      "learning_rate": 3.1746031746031746e-06,
      "loss": 1.6423,
      "step": 20
    },
    {
      "epoch": 0.01985900109224506,
      "grad_norm": 1.0260020187790146,
      "learning_rate": 3.968253968253968e-06,
      "loss": 1.6245,
      "step": 25
    },
    {
      "epoch": 0.023830801310694073,
      "grad_norm": 0.9521643373391829,
      "learning_rate": 4.761904761904762e-06,
      "loss": 1.6242,
      "step": 30
    },
    {
      "epoch": 0.027802601529143083,
      "grad_norm": 0.9087502803225287,
      "learning_rate": 5.555555555555557e-06,
      "loss": 1.6067,
      "step": 35
    },
    {
      "epoch": 0.03177440174759209,
      "grad_norm": 0.9276432527674262,
      "learning_rate": 6.349206349206349e-06,
      "loss": 1.5848,
      "step": 40
    },
    {
      "epoch": 0.035746201966041107,
      "grad_norm": 0.8919073066787582,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.6,
      "step": 45
    },
    {
      "epoch": 0.03971800218449012,
      "grad_norm": 0.965858397020833,
      "learning_rate": 7.936507936507936e-06,
      "loss": 1.6023,
      "step": 50
    },
    {
      "epoch": 0.04368980240293913,
      "grad_norm": 0.9487708676860586,
      "learning_rate": 8.730158730158731e-06,
      "loss": 1.5826,
      "step": 55
    },
    {
      "epoch": 0.04766160262138815,
      "grad_norm": 0.9151332727520083,
      "learning_rate": 9.523809523809525e-06,
      "loss": 1.5978,
      "step": 60
    },
    {
      "epoch": 0.05163340283983715,
      "grad_norm": 0.917185703733017,
      "learning_rate": 1.031746031746032e-05,
      "loss": 1.6003,
      "step": 65
    },
    {
      "epoch": 0.055605203058286166,
      "grad_norm": 0.9194628312444804,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 1.5856,
      "step": 70
    },
    {
      "epoch": 0.05957700327673518,
      "grad_norm": 0.8901707069616845,
      "learning_rate": 1.1904761904761905e-05,
      "loss": 1.5846,
      "step": 75
    },
    {
      "epoch": 0.06354880349518419,
      "grad_norm": 0.966608805341671,
      "learning_rate": 1.2698412698412699e-05,
      "loss": 1.6089,
      "step": 80
    },
    {
      "epoch": 0.0675206037136332,
      "grad_norm": 0.9313133663663362,
      "learning_rate": 1.3492063492063494e-05,
      "loss": 1.5815,
      "step": 85
    },
    {
      "epoch": 0.07149240393208221,
      "grad_norm": 0.9808540327178217,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 1.5816,
      "step": 90
    },
    {
      "epoch": 0.07546420415053123,
      "grad_norm": 0.9026305570096459,
      "learning_rate": 1.507936507936508e-05,
      "loss": 1.5958,
      "step": 95
    },
    {
      "epoch": 0.07943600436898024,
      "grad_norm": 0.9788483223436265,
      "learning_rate": 1.5873015873015872e-05,
      "loss": 1.5911,
      "step": 100
    },
    {
      "epoch": 0.08340780458742925,
      "grad_norm": 0.9515538442938523,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.5865,
      "step": 105
    },
    {
      "epoch": 0.08737960480587827,
      "grad_norm": 0.926406626131289,
      "learning_rate": 1.7460317460317463e-05,
      "loss": 1.5793,
      "step": 110
    },
    {
      "epoch": 0.09135140502432727,
      "grad_norm": 0.922601693661366,
      "learning_rate": 1.8253968253968254e-05,
      "loss": 1.5822,
      "step": 115
    },
    {
      "epoch": 0.0953232052427763,
      "grad_norm": 0.9585781023399166,
      "learning_rate": 1.904761904761905e-05,
      "loss": 1.5718,
      "step": 120
    },
    {
      "epoch": 0.0992950054612253,
      "grad_norm": 1.0121070868569275,
      "learning_rate": 1.9841269841269845e-05,
      "loss": 1.5773,
      "step": 125
    },
    {
      "epoch": 0.1032668056796743,
      "grad_norm": 1.0425213086708742,
      "learning_rate": 1.999938384153589e-05,
      "loss": 1.585,
      "step": 130
    },
    {
      "epoch": 0.10723860589812333,
      "grad_norm": 0.9511949716486409,
      "learning_rate": 1.999688082790923e-05,
      "loss": 1.5868,
      "step": 135
    },
    {
      "epoch": 0.11121040611657233,
      "grad_norm": 0.9590697791262225,
      "learning_rate": 1.9992452930796544e-05,
      "loss": 1.5776,
      "step": 140
    },
    {
      "epoch": 0.11518220633502135,
      "grad_norm": 0.932339341553828,
      "learning_rate": 1.9986101002782376e-05,
      "loss": 1.5789,
      "step": 145
    },
    {
      "epoch": 0.11915400655347036,
      "grad_norm": 0.9529377325330747,
      "learning_rate": 1.997782626692034e-05,
      "loss": 1.5814,
      "step": 150
    },
    {
      "epoch": 0.12312580677191937,
      "grad_norm": 0.9257334084732001,
      "learning_rate": 1.9967630316497663e-05,
      "loss": 1.5659,
      "step": 155
    },
    {
      "epoch": 0.12709760699036837,
      "grad_norm": 0.9578201119584917,
      "learning_rate": 1.995551511472836e-05,
      "loss": 1.5844,
      "step": 160
    },
    {
      "epoch": 0.1310694072088174,
      "grad_norm": 0.9483988649883341,
      "learning_rate": 1.994148299437524e-05,
      "loss": 1.559,
      "step": 165
    },
    {
      "epoch": 0.1350412074272664,
      "grad_norm": 0.9752973142389638,
      "learning_rate": 1.9925536657300734e-05,
      "loss": 1.5783,
      "step": 170
    },
    {
      "epoch": 0.13901300764571542,
      "grad_norm": 0.9082694825570907,
      "learning_rate": 1.990767917394666e-05,
      "loss": 1.5716,
      "step": 175
    },
    {
      "epoch": 0.14298480786416443,
      "grad_norm": 0.9870799951805275,
      "learning_rate": 1.9887913982743e-05,
      "loss": 1.5705,
      "step": 180
    },
    {
      "epoch": 0.14695660808261343,
      "grad_norm": 0.8978375791866258,
      "learning_rate": 1.986624488944585e-05,
      "loss": 1.5738,
      "step": 185
    },
    {
      "epoch": 0.15092840830106247,
      "grad_norm": 0.9206959902444366,
      "learning_rate": 1.984267606640462e-05,
      "loss": 1.5729,
      "step": 190
    },
    {
      "epoch": 0.15490020851951147,
      "grad_norm": 0.9532760851392515,
      "learning_rate": 1.9817212051758667e-05,
      "loss": 1.5674,
      "step": 195
    },
    {
      "epoch": 0.15887200873796048,
      "grad_norm": 0.8995123548574,
      "learning_rate": 1.978985774856346e-05,
      "loss": 1.5683,
      "step": 200
    },
    {
      "epoch": 0.16284380895640949,
      "grad_norm": 0.9731444458977212,
      "learning_rate": 1.9760618423846526e-05,
      "loss": 1.5738,
      "step": 205
    },
    {
      "epoch": 0.1668156091748585,
      "grad_norm": 0.9813203114593326,
      "learning_rate": 1.9729499707593284e-05,
      "loss": 1.5826,
      "step": 210
    },
    {
      "epoch": 0.17078740939330753,
      "grad_norm": 0.9301814711446849,
      "learning_rate": 1.9696507591663003e-05,
      "loss": 1.5565,
      "step": 215
    },
    {
      "epoch": 0.17475920961175653,
      "grad_norm": 0.9084286747837647,
      "learning_rate": 1.9661648428635066e-05,
      "loss": 1.5621,
      "step": 220
    },
    {
      "epoch": 0.17873100983020554,
      "grad_norm": 0.9292775329242645,
      "learning_rate": 1.962492893058582e-05,
      "loss": 1.5533,
      "step": 225
    },
    {
      "epoch": 0.18270281004865455,
      "grad_norm": 0.9282763244366917,
      "learning_rate": 1.9586356167796145e-05,
      "loss": 1.5801,
      "step": 230
    },
    {
      "epoch": 0.18667461026710355,
      "grad_norm": 0.9219988029696228,
      "learning_rate": 1.954593756739009e-05,
      "loss": 1.5802,
      "step": 235
    },
    {
      "epoch": 0.1906464104855526,
      "grad_norm": 0.9871605765917675,
      "learning_rate": 1.9503680911904822e-05,
      "loss": 1.5817,
      "step": 240
    },
    {
      "epoch": 0.1946182107040016,
      "grad_norm": 0.9551024433790934,
      "learning_rate": 1.9459594337792063e-05,
      "loss": 1.571,
      "step": 245
    },
    {
      "epoch": 0.1985900109224506,
      "grad_norm": 0.9155920092825396,
      "learning_rate": 1.9413686333851465e-05,
      "loss": 1.5694,
      "step": 250
    },
    {
      "epoch": 0.2025618111408996,
      "grad_norm": 0.9097167589577474,
      "learning_rate": 1.9365965739596086e-05,
      "loss": 1.556,
      "step": 255
    },
    {
      "epoch": 0.2065336113593486,
      "grad_norm": 0.9023225707856134,
      "learning_rate": 1.9316441743550375e-05,
      "loss": 1.5762,
      "step": 260
    },
    {
      "epoch": 0.21050541157779765,
      "grad_norm": 0.9077263206493167,
      "learning_rate": 1.9265123881480912e-05,
      "loss": 1.5706,
      "step": 265
    },
    {
      "epoch": 0.21447721179624665,
      "grad_norm": 0.8937248614200829,
      "learning_rate": 1.9212022034560332e-05,
      "loss": 1.567,
      "step": 270
    },
    {
      "epoch": 0.21844901201469566,
      "grad_norm": 0.9122061973911317,
      "learning_rate": 1.91571464274647e-05,
      "loss": 1.5742,
      "step": 275
    },
    {
      "epoch": 0.22242081223314467,
      "grad_norm": 0.9625286348265168,
      "learning_rate": 1.91005076264048e-05,
      "loss": 1.5653,
      "step": 280
    },
    {
      "epoch": 0.22639261245159367,
      "grad_norm": 0.9485988527754959,
      "learning_rate": 1.9042116537091583e-05,
      "loss": 1.555,
      "step": 285
    },
    {
      "epoch": 0.2303644126700427,
      "grad_norm": 0.9576868955487222,
      "learning_rate": 1.898198440263633e-05,
      "loss": 1.5624,
      "step": 290
    },
    {
      "epoch": 0.2343362128884917,
      "grad_norm": 0.947081159201408,
      "learning_rate": 1.8920122801385785e-05,
      "loss": 1.5567,
      "step": 295
    },
    {
      "epoch": 0.23830801310694072,
      "grad_norm": 0.9810396328169484,
      "learning_rate": 1.8856543644692767e-05,
      "loss": 1.5552,
      "step": 300
    },
    {
      "epoch": 0.24227981332538973,
      "grad_norm": 0.9296206498184163,
      "learning_rate": 1.8791259174622668e-05,
      "loss": 1.5791,
      "step": 305
    },
    {
      "epoch": 0.24625161354383873,
      "grad_norm": 0.9647483866705726,
      "learning_rate": 1.8724281961596255e-05,
      "loss": 1.5604,
      "step": 310
    },
    {
      "epoch": 0.25022341376228774,
      "grad_norm": 0.8990721005609152,
      "learning_rate": 1.865562490196924e-05,
      "loss": 1.5648,
      "step": 315
    },
    {
      "epoch": 0.25419521398073675,
      "grad_norm": 0.8752521530386431,
      "learning_rate": 1.8585301215549152e-05,
      "loss": 1.575,
      "step": 320
    },
    {
      "epoch": 0.2581670141991858,
      "grad_norm": 0.9677847810573758,
      "learning_rate": 1.8513324443049826e-05,
      "loss": 1.5752,
      "step": 325
    },
    {
      "epoch": 0.2621388144176348,
      "grad_norm": 0.8650992533303118,
      "learning_rate": 1.8439708443484212e-05,
      "loss": 1.5576,
      "step": 330
    },
    {
      "epoch": 0.2661106146360838,
      "grad_norm": 0.8721763870519639,
      "learning_rate": 1.836446739149581e-05,
      "loss": 1.55,
      "step": 335
    },
    {
      "epoch": 0.2700824148545328,
      "grad_norm": 0.9043150779726963,
      "learning_rate": 1.8287615774629372e-05,
      "loss": 1.5736,
      "step": 340
    },
    {
      "epoch": 0.27405421507298183,
      "grad_norm": 0.872060546407473,
      "learning_rate": 1.820916839054137e-05,
      "loss": 1.5739,
      "step": 345
    },
    {
      "epoch": 0.27802601529143084,
      "grad_norm": 0.9490597285856667,
      "learning_rate": 1.8129140344150698e-05,
      "loss": 1.5656,
      "step": 350
    },
    {
      "epoch": 0.28199781550987985,
      "grad_norm": 0.9293959378534794,
      "learning_rate": 1.8047547044730266e-05,
      "loss": 1.5601,
      "step": 355
    },
    {
      "epoch": 0.28596961572832885,
      "grad_norm": 0.8875229973252522,
      "learning_rate": 1.796440420293996e-05,
      "loss": 1.5595,
      "step": 360
    },
    {
      "epoch": 0.28994141594677786,
      "grad_norm": 0.9241685744107659,
      "learning_rate": 1.7879727827801587e-05,
      "loss": 1.5681,
      "step": 365
    },
    {
      "epoch": 0.29391321616522686,
      "grad_norm": 0.9255387788030182,
      "learning_rate": 1.7793534223616354e-05,
      "loss": 1.5613,
      "step": 370
    },
    {
      "epoch": 0.2978850163836759,
      "grad_norm": 0.9447825641314226,
      "learning_rate": 1.7705839986825502e-05,
      "loss": 1.5726,
      "step": 375
    },
    {
      "epoch": 0.30185681660212493,
      "grad_norm": 0.879507513143992,
      "learning_rate": 1.7616662002814704e-05,
      "loss": 1.5419,
      "step": 380
    },
    {
      "epoch": 0.30582861682057394,
      "grad_norm": 0.89862098882536,
      "learning_rate": 1.752601744266278e-05,
      "loss": 1.5516,
      "step": 385
    },
    {
      "epoch": 0.30980041703902295,
      "grad_norm": 0.9106190928925514,
      "learning_rate": 1.7433923759835468e-05,
      "loss": 1.5565,
      "step": 390
    },
    {
      "epoch": 0.31377221725747195,
      "grad_norm": 0.9288737547498717,
      "learning_rate": 1.7340398686824755e-05,
      "loss": 1.5732,
      "step": 395
    },
    {
      "epoch": 0.31774401747592096,
      "grad_norm": 0.9087033425211577,
      "learning_rate": 1.7245460231734537e-05,
      "loss": 1.5492,
      "step": 400
    },
    {
      "epoch": 0.32171581769436997,
      "grad_norm": 0.8999862928001036,
      "learning_rate": 1.7149126674813174e-05,
      "loss": 1.5695,
      "step": 405
    },
    {
      "epoch": 0.32568761791281897,
      "grad_norm": 0.9119036306365369,
      "learning_rate": 1.7051416564933677e-05,
      "loss": 1.5507,
      "step": 410
    },
    {
      "epoch": 0.329659418131268,
      "grad_norm": 0.8975417094048892,
      "learning_rate": 1.6952348716022112e-05,
      "loss": 1.5902,
      "step": 415
    },
    {
      "epoch": 0.333631218349717,
      "grad_norm": 0.8840046538984045,
      "learning_rate": 1.6851942203435056e-05,
      "loss": 1.5592,
      "step": 420
    },
    {
      "epoch": 0.33760301856816605,
      "grad_norm": 0.9029659180561906,
      "learning_rate": 1.6750216360286634e-05,
      "loss": 1.5829,
      "step": 425
    },
    {
      "epoch": 0.34157481878661505,
      "grad_norm": 0.8585662547884547,
      "learning_rate": 1.664719077372597e-05,
      "loss": 1.5576,
      "step": 430
    },
    {
      "epoch": 0.34554661900506406,
      "grad_norm": 0.8785123954008921,
      "learning_rate": 1.6563847811650376e-05,
      "loss": 1.5683,
      "step": 435
    },
    {
      "epoch": 0.34951841922351307,
      "grad_norm": 0.8697699611475651,
      "learning_rate": 1.64585328429674e-05,
      "loss": 1.5448,
      "step": 440
    },
    {
      "epoch": 0.3534902194419621,
      "grad_norm": 0.8843659106432961,
      "learning_rate": 1.635197429406901e-05,
      "loss": 1.5726,
      "step": 445
    },
    {
      "epoch": 0.3574620196604111,
      "grad_norm": 0.9485513906574914,
      "learning_rate": 1.6244192682634143e-05,
      "loss": 1.5465,
      "step": 450
    },
    {
      "epoch": 0.3614338198788601,
      "grad_norm": 0.9407868053462191,
      "learning_rate": 1.6135208761840457e-05,
      "loss": 1.5591,
      "step": 455
    },
    {
      "epoch": 0.3654056200973091,
      "grad_norm": 0.9465851646246182,
      "learning_rate": 1.602504351636838e-05,
      "loss": 1.5534,
      "step": 460
    },
    {
      "epoch": 0.3693774203157581,
      "grad_norm": 0.9186695674403026,
      "learning_rate": 1.591371815836051e-05,
      "loss": 1.5543,
      "step": 465
    },
    {
      "epoch": 0.3733492205342071,
      "grad_norm": 0.8884315626992574,
      "learning_rate": 1.580125412333728e-05,
      "loss": 1.5402,
      "step": 470
    },
    {
      "epoch": 0.37732102075265617,
      "grad_norm": 0.8733398363412331,
      "learning_rate": 1.5687673066069568e-05,
      "loss": 1.552,
      "step": 475
    },
    {
      "epoch": 0.3812928209711052,
      "grad_norm": 0.8992808033952767,
      "learning_rate": 1.5572996856409094e-05,
      "loss": 1.5638,
      "step": 480
    },
    {
      "epoch": 0.3852646211895542,
      "grad_norm": 0.8937067889490861,
      "learning_rate": 1.5457247575077445e-05,
      "loss": 1.5406,
      "step": 485
    },
    {
      "epoch": 0.3892364214080032,
      "grad_norm": 0.9108964394640126,
      "learning_rate": 1.534044750941444e-05,
      "loss": 1.5472,
      "step": 490
    },
    {
      "epoch": 0.3932082216264522,
      "grad_norm": 0.8793030627405377,
      "learning_rate": 1.5222619149086746e-05,
      "loss": 1.5412,
      "step": 495
    },
    {
      "epoch": 0.3971800218449012,
      "grad_norm": 0.8782946871477869,
      "learning_rate": 1.5103785181757533e-05,
      "loss": 1.5396,
      "step": 500
    },
    {
      "epoch": 0.4011518220633502,
      "grad_norm": 0.8674461660159545,
      "learning_rate": 1.4983968488718005e-05,
      "loss": 1.5426,
      "step": 505
    },
    {
      "epoch": 0.4051236222817992,
      "grad_norm": 0.8976341288348811,
      "learning_rate": 1.4863192140481624e-05,
      "loss": 1.5537,
      "step": 510
    },
    {
      "epoch": 0.4090954225002482,
      "grad_norm": 0.8431858627506479,
      "learning_rate": 1.4741479392341941e-05,
      "loss": 1.5586,
      "step": 515
    },
    {
      "epoch": 0.4130672227186972,
      "grad_norm": 0.9421670179657328,
      "learning_rate": 1.4618853679894813e-05,
      "loss": 1.5202,
      "step": 520
    },
    {
      "epoch": 0.4170390229371463,
      "grad_norm": 0.8878585099095585,
      "learning_rate": 1.4495338614525927e-05,
      "loss": 1.5507,
      "step": 525
    },
    {
      "epoch": 0.4210108231555953,
      "grad_norm": 0.9642989519892418,
      "learning_rate": 1.437095797886445e-05,
      "loss": 1.5488,
      "step": 530
    },
    {
      "epoch": 0.4249826233740443,
      "grad_norm": 0.9246966546921916,
      "learning_rate": 1.4245735722203736e-05,
      "loss": 1.5401,
      "step": 535
    },
    {
      "epoch": 0.4289544235924933,
      "grad_norm": 0.9529371824264209,
      "learning_rate": 1.4119695955889925e-05,
      "loss": 1.5495,
      "step": 540
    },
    {
      "epoch": 0.4329262238109423,
      "grad_norm": 0.8825791672603804,
      "learning_rate": 1.3992862948679332e-05,
      "loss": 1.5491,
      "step": 545
    },
    {
      "epoch": 0.4368980240293913,
      "grad_norm": 0.8881684368996328,
      "learning_rate": 1.3865261122065551e-05,
      "loss": 1.5482,
      "step": 550
    },
    {
      "epoch": 0.4408698242478403,
      "grad_norm": 0.8423283640939411,
      "learning_rate": 1.3736915045577122e-05,
      "loss": 1.5488,
      "step": 555
    },
    {
      "epoch": 0.44484162446628933,
      "grad_norm": 0.8255688998685623,
      "learning_rate": 1.3607849432046717e-05,
      "loss": 1.5478,
      "step": 560
    },
    {
      "epoch": 0.44881342468473834,
      "grad_norm": 0.8357255814716047,
      "learning_rate": 1.3478089132852717e-05,
      "loss": 1.5598,
      "step": 565
    },
    {
      "epoch": 0.45278522490318734,
      "grad_norm": 0.8217394668509155,
      "learning_rate": 1.3347659133134118e-05,
      "loss": 1.5141,
      "step": 570
    },
    {
      "epoch": 0.4567570251216364,
      "grad_norm": 0.8370380332545342,
      "learning_rate": 1.3216584546979702e-05,
      "loss": 1.5338,
      "step": 575
    },
    {
      "epoch": 0.4607288253400854,
      "grad_norm": 0.9499613626096974,
      "learning_rate": 1.3084890612592325e-05,
      "loss": 1.5633,
      "step": 580
    },
    {
      "epoch": 0.4647006255585344,
      "grad_norm": 0.9043571179729512,
      "learning_rate": 1.2979106570683663e-05,
      "loss": 1.5624,
      "step": 585
    },
    {
      "epoch": 0.4686724257769834,
      "grad_norm": 0.837192808810411,
      "learning_rate": 1.2846361787292137e-05,
      "loss": 1.5514,
      "step": 590
    },
    {
      "epoch": 0.47264422599543243,
      "grad_norm": 0.893985182264359,
      "learning_rate": 1.2713068941470547e-05,
      "loss": 1.5609,
      "step": 595
    },
    {
      "epoch": 0.47661602621388144,
      "grad_norm": 0.9569825734990193,
      "learning_rate": 1.2579253698544124e-05,
      "loss": 1.5421,
      "step": 600
    },
    {
      "epoch": 0.48058782643233044,
      "grad_norm": 0.8770646136960255,
      "learning_rate": 1.2444941824424825e-05,
      "loss": 1.5392,
      "step": 605
    },
    {
      "epoch": 0.48455962665077945,
      "grad_norm": 0.9065759679781771,
      "learning_rate": 1.2310159180650158e-05,
      "loss": 1.5277,
      "step": 610
    },
    {
      "epoch": 0.48853142686922846,
      "grad_norm": 0.8727166144942986,
      "learning_rate": 1.2174931719403568e-05,
      "loss": 1.5206,
      "step": 615
    },
    {
      "epoch": 0.49250322708767746,
      "grad_norm": 0.8374618600702926,
      "learning_rate": 1.2039285478517417e-05,
      "loss": 1.5363,
      "step": 620
    },
    {
      "epoch": 0.4964750273061265,
      "grad_norm": 0.8559233592367627,
      "learning_rate": 1.1903246576459398e-05,
      "loss": 1.5188,
      "step": 625
    },
    {
      "epoch": 0.5004468275245755,
      "grad_norm": 0.8928106567260038,
      "learning_rate": 1.1766841207303498e-05,
      "loss": 1.5388,
      "step": 630
    },
    {
      "epoch": 0.5044186277430245,
      "grad_norm": 0.9205521273045262,
      "learning_rate": 1.1630095635686359e-05,
      "loss": 1.5246,
      "step": 635
    },
    {
      "epoch": 0.5083904279614735,
      "grad_norm": 0.8459927952590032,
      "learning_rate": 1.1493036191750067e-05,
      "loss": 1.5597,
      "step": 640
    },
    {
      "epoch": 0.5123622281799225,
      "grad_norm": 0.8954533380109134,
      "learning_rate": 1.1355689266072314e-05,
      "loss": 1.5407,
      "step": 645
    },
    {
      "epoch": 0.5163340283983716,
      "grad_norm": 0.8801173721660838,
      "learning_rate": 1.1218081304584959e-05,
      "loss": 1.5358,
      "step": 650
    },
    {
      "epoch": 0.5203058286168206,
      "grad_norm": 0.8777403433212762,
      "learning_rate": 1.1080238803481878e-05,
      "loss": 1.5529,
      "step": 655
    },
    {
      "epoch": 0.5242776288352696,
      "grad_norm": 0.8804685666128383,
      "learning_rate": 1.0942188304117184e-05,
      "loss": 1.5373,
      "step": 660
    },
    {
      "epoch": 0.5282494290537186,
      "grad_norm": 0.8340381127557642,
      "learning_rate": 1.0803956387894715e-05,
      "loss": 1.5454,
      "step": 665
    },
    {
      "epoch": 0.5322212292721676,
      "grad_norm": 0.8855144262951853,
      "learning_rate": 1.066556967114984e-05,
      "loss": 1.5283,
      "step": 670
    },
    {
      "epoch": 0.5361930294906166,
      "grad_norm": 0.8545713944344788,
      "learning_rate": 1.0527054800024537e-05,
      "loss": 1.5434,
      "step": 675
    },
    {
      "epoch": 0.5401648297090657,
      "grad_norm": 0.8489068751890607,
      "learning_rate": 1.0388438445336677e-05,
      "loss": 1.5134,
      "step": 680
    },
    {
      "epoch": 0.5441366299275147,
      "grad_norm": 0.8830252810636811,
      "learning_rate": 1.0249747297444659e-05,
      "loss": 1.5412,
      "step": 685
    },
    {
      "epoch": 0.5481084301459637,
      "grad_norm": 0.815565252068504,
      "learning_rate": 1.0111008061108176e-05,
      "loss": 1.5327,
      "step": 690
    },
    {
      "epoch": 0.5520802303644127,
      "grad_norm": 0.8249011581838526,
      "learning_rate": 9.972247450346272e-06,
      "loss": 1.5083,
      "step": 695
    },
    {
      "epoch": 0.5560520305828617,
      "grad_norm": 0.8855145666062009,
      "learning_rate": 9.833492183293616e-06,
      "loss": 1.5481,
      "step": 700
    },
    {
      "epoch": 0.5600238308013107,
      "grad_norm": 0.8735055764957946,
      "learning_rate": 9.69476897705595e-06,
      "loss": 1.5224,
      "step": 705
    },
    {
      "epoch": 0.5639956310197597,
      "grad_norm": 0.8651835408338951,
      "learning_rate": 9.55610454256575e-06,
      "loss": 1.5291,
      "step": 710
    },
    {
      "epoch": 0.5679674312382087,
      "grad_norm": 0.8804958974715262,
      "learning_rate": 9.417525579439094e-06,
      "loss": 1.5248,
      "step": 715
    },
    {
      "epoch": 0.5719392314566577,
      "grad_norm": 0.8450121942474024,
      "learning_rate": 9.279058770834679e-06,
      "loss": 1.5264,
      "step": 720
    },
    {
      "epoch": 0.5759110316751067,
      "grad_norm": 0.8556582918574475,
      "learning_rate": 9.140730778316037e-06,
      "loss": 1.5464,
      "step": 725
    },
    {
      "epoch": 0.5798828318935557,
      "grad_norm": 0.8851588599533484,
      "learning_rate": 9.002568236717863e-06,
      "loss": 1.5389,
      "step": 730
    },
    {
      "epoch": 0.5838546321120047,
      "grad_norm": 0.8608369842804535,
      "learning_rate": 8.864597749017566e-06,
      "loss": 1.5392,
      "step": 735
    },
    {
      "epoch": 0.5878264323304537,
      "grad_norm": 0.8184232636661748,
      "learning_rate": 8.72684588121287e-06,
      "loss": 1.558,
      "step": 740
    },
    {
      "epoch": 0.5917982325489027,
      "grad_norm": 0.84877451025036,
      "learning_rate": 8.589339157206583e-06,
      "loss": 1.5388,
      "step": 745
    },
    {
      "epoch": 0.5957700327673519,
      "grad_norm": 0.8092058683281641,
      "learning_rate": 8.452104053699474e-06,
      "loss": 1.5313,
      "step": 750
    },
    {
      "epoch": 0.5997418329858009,
      "grad_norm": 0.8452233779619619,
      "learning_rate": 8.315166995092206e-06,
      "loss": 1.5259,
      "step": 755
    },
    {
      "epoch": 0.6037136332042499,
      "grad_norm": 0.8484399724532287,
      "learning_rate": 8.178554348397388e-06,
      "loss": 1.5193,
      "step": 760
    },
    {
      "epoch": 0.6076854334226989,
      "grad_norm": 0.8144552151209362,
      "learning_rate": 8.042292418162611e-06,
      "loss": 1.5046,
      "step": 765
    },
    {
      "epoch": 0.6116572336411479,
      "grad_norm": 0.8339533354115302,
      "learning_rate": 7.906407441405586e-06,
      "loss": 1.5372,
      "step": 770
    },
    {
      "epoch": 0.6156290338595969,
      "grad_norm": 0.8305252567757494,
      "learning_rate": 7.770925582562228e-06,
      "loss": 1.5365,
      "step": 775
    },
    {
      "epoch": 0.6196008340780459,
      "grad_norm": 0.8327672905609981,
      "learning_rate": 7.635872928448734e-06,
      "loss": 1.5326,
      "step": 780
    },
    {
      "epoch": 0.6235726342964949,
      "grad_norm": 0.844461561584183,
      "learning_rate": 7.501275483238619e-06,
      "loss": 1.543,
      "step": 785
    },
    {
      "epoch": 0.6275444345149439,
      "grad_norm": 0.8619627758469628,
      "learning_rate": 7.367159163455648e-06,
      "loss": 1.5259,
      "step": 790
    },
    {
      "epoch": 0.6315162347333929,
      "grad_norm": 0.8366660180458262,
      "learning_rate": 7.2335497929836565e-06,
      "loss": 1.5465,
      "step": 795
    },
    {
      "epoch": 0.6354880349518419,
      "grad_norm": 0.8803323174183961,
      "learning_rate": 7.10047309809418e-06,
      "loss": 1.5412,
      "step": 800
    },
    {
      "epoch": 0.6394598351702909,
      "grad_norm": 0.8544630082563632,
      "learning_rate": 6.967954702492939e-06,
      "loss": 1.5207,
      "step": 805
    },
    {
      "epoch": 0.6434316353887399,
      "grad_norm": 0.8328235652984776,
      "learning_rate": 6.8360201223860024e-06,
      "loss": 1.5407,
      "step": 810
    },
    {
      "epoch": 0.6474034356071889,
      "grad_norm": 0.8462106579634066,
      "learning_rate": 6.704694761566697e-06,
      "loss": 1.5217,
      "step": 815
    },
    {
      "epoch": 0.6513752358256379,
      "grad_norm": 0.8277784955761321,
      "learning_rate": 6.574003906524149e-06,
      "loss": 1.5389,
      "step": 820
    },
    {
      "epoch": 0.655347036044087,
      "grad_norm": 0.8532449534893878,
      "learning_rate": 6.443972721574409e-06,
      "loss": 1.5046,
      "step": 825
    },
    {
      "epoch": 0.659318836262536,
      "grad_norm": 0.8230178573303815,
      "learning_rate": 6.314626244015099e-06,
      "loss": 1.5062,
      "step": 830
    },
    {
      "epoch": 0.663290636480985,
      "grad_norm": 0.84801479235042,
      "learning_rate": 6.18598937930452e-06,
      "loss": 1.5203,
      "step": 835
    },
    {
      "epoch": 0.667262436699434,
      "grad_norm": 0.8397075331547054,
      "learning_rate": 6.058086896266149e-06,
      "loss": 1.5242,
      "step": 840
    },
    {
      "epoch": 0.671234236917883,
      "grad_norm": 0.8597360676196729,
      "learning_rate": 5.930943422319453e-06,
      "loss": 1.5055,
      "step": 845
    },
    {
      "epoch": 0.6752060371363321,
      "grad_norm": 0.8775604658344065,
      "learning_rate": 5.80458343873789e-06,
      "loss": 1.5257,
      "step": 850
    },
    {
      "epoch": 0.6791778373547811,
      "grad_norm": 0.8235956970430471,
      "learning_rate": 5.679031275935104e-06,
      "loss": 1.5312,
      "step": 855
    },
    {
      "epoch": 0.6831496375732301,
      "grad_norm": 0.8867364985685439,
      "learning_rate": 5.55431110878014e-06,
      "loss": 1.5074,
      "step": 860
    },
    {
      "epoch": 0.6871214377916791,
      "grad_norm": 0.861522055731343,
      "learning_rate": 5.430446951942597e-06,
      "loss": 1.538,
      "step": 865
    },
    {
      "epoch": 0.6910932380101281,
      "grad_norm": 0.8275026007168744,
      "learning_rate": 5.307462655268651e-06,
      "loss": 1.5146,
      "step": 870
    },
    {
      "epoch": 0.6950650382285771,
      "grad_norm": 1.0158409774185204,
      "learning_rate": 5.185381899188811e-06,
      "loss": 1.5276,
      "step": 875
    },
    {
      "epoch": 0.6990368384470261,
      "grad_norm": 0.8341799420785135,
      "learning_rate": 5.064228190158274e-06,
      "loss": 1.5281,
      "step": 880
    },
    {
      "epoch": 0.7030086386654751,
      "grad_norm": 0.8162862561088432,
      "learning_rate": 4.944024856130813e-06,
      "loss": 1.5093,
      "step": 885
    },
    {
      "epoch": 0.7069804388839241,
      "grad_norm": 0.8479134502646151,
      "learning_rate": 4.824795042066997e-06,
      "loss": 1.5455,
      "step": 890
    },
    {
      "epoch": 0.7109522391023732,
      "grad_norm": 0.8433313440901115,
      "learning_rate": 4.706561705477687e-06,
      "loss": 1.5226,
      "step": 895
    },
    {
      "epoch": 0.7149240393208222,
      "grad_norm": 0.8370841428358435,
      "learning_rate": 4.5893476120035895e-06,
      "loss": 1.5412,
      "step": 900
    },
    {
      "epoch": 0.7188958395392712,
      "grad_norm": 0.8424960564588163,
      "learning_rate": 4.473175331031765e-06,
      "loss": 1.5175,
      "step": 905
    },
    {
      "epoch": 0.7228676397577202,
      "grad_norm": 0.8032981360581487,
      "learning_rate": 4.358067231349942e-06,
      "loss": 1.5276,
      "step": 910
    },
    {
      "epoch": 0.7268394399761692,
      "grad_norm": 0.7960926551931078,
      "learning_rate": 4.244045476839439e-06,
      "loss": 1.5167,
      "step": 915
    },
    {
      "epoch": 0.7308112401946182,
      "grad_norm": 0.838320712946578,
      "learning_rate": 4.131132022207537e-06,
      "loss": 1.5445,
      "step": 920
    },
    {
      "epoch": 0.7347830404130672,
      "grad_norm": 0.8306388787438427,
      "learning_rate": 4.019348608760137e-06,
      "loss": 1.5374,
      "step": 925
    },
    {
      "epoch": 0.7387548406315162,
      "grad_norm": 0.8194701201913649,
      "learning_rate": 3.908716760215513e-06,
      "loss": 1.5204,
      "step": 930
    },
    {
      "epoch": 0.7427266408499652,
      "grad_norm": 0.8194871368550529,
      "learning_rate": 3.799257778559955e-06,
      "loss": 1.5292,
      "step": 935
    },
    {
      "epoch": 0.7466984410684142,
      "grad_norm": 0.824217253107322,
      "learning_rate": 3.6909927399460942e-06,
      "loss": 1.5336,
      "step": 940
    },
    {
      "epoch": 0.7506702412868632,
      "grad_norm": 0.8142575339219603,
      "learning_rate": 3.5839424906347274e-06,
      "loss": 1.5092,
      "step": 945
    },
    {
      "epoch": 0.7546420415053123,
      "grad_norm": 0.807677248718189,
      "learning_rate": 3.4781276429809153e-06,
      "loss": 1.5314,
      "step": 950
    },
    {
      "epoch": 0.7586138417237613,
      "grad_norm": 0.8089021775387019,
      "learning_rate": 3.3735685714650925e-06,
      "loss": 1.5235,
      "step": 955
    },
    {
      "epoch": 0.7625856419422103,
      "grad_norm": 0.8366198862903579,
      "learning_rate": 3.270285408769991e-06,
      "loss": 1.5381,
      "step": 960
    },
    {
      "epoch": 0.7665574421606594,
      "grad_norm": 0.8405851483984087,
      "learning_rate": 3.168298041904141e-06,
      "loss": 1.5217,
      "step": 965
    },
    {
      "epoch": 0.7705292423791084,
      "grad_norm": 0.8520055208027894,
      "learning_rate": 3.0676261083726466e-06,
      "loss": 1.5293,
      "step": 970
    },
    {
      "epoch": 0.7745010425975574,
      "grad_norm": 0.8355631433834037,
      "learning_rate": 2.968288992396009e-06,
      "loss": 1.5132,
      "step": 975
    },
    {
      "epoch": 0.7784728428160064,
      "grad_norm": 0.8291513738741879,
      "learning_rate": 2.870305821177747e-06,
      "loss": 1.5268,
      "step": 980
    },
    {
      "epoch": 0.7824446430344554,
      "grad_norm": 0.8378530571179096,
      "learning_rate": 2.773695461221464e-06,
      "loss": 1.5098,
      "step": 985
    },
    {
      "epoch": 0.7864164432529044,
      "grad_norm": 0.8199379802303441,
      "learning_rate": 2.678476514698146e-06,
      "loss": 1.5431,
      "step": 990
    },
    {
      "epoch": 0.7903882434713534,
      "grad_norm": 0.8149428301792468,
      "learning_rate": 2.584667315864334e-06,
      "loss": 1.5524,
      "step": 995
    },
    {
      "epoch": 0.7943600436898024,
      "grad_norm": 0.8127178564027231,
      "learning_rate": 2.492285927531893e-06,
      "loss": 1.5246,
      "step": 1000
    },
    {
      "epoch": 0.7983318439082514,
      "grad_norm": 0.8296085136845848,
      "learning_rate": 2.4013501375900604e-06,
      "loss": 1.5428,
      "step": 1005
    },
    {
      "epoch": 0.8023036441267004,
      "grad_norm": 0.8331298630227156,
      "learning_rate": 2.3118774555803915e-06,
      "loss": 1.5073,
      "step": 1010
    },
    {
      "epoch": 0.8062754443451494,
      "grad_norm": 0.8084677789049869,
      "learning_rate": 2.2238851093253476e-06,
      "loss": 1.518,
      "step": 1015
    },
    {
      "epoch": 0.8102472445635984,
      "grad_norm": 0.7995439438308841,
      "learning_rate": 2.1373900416110973e-06,
      "loss": 1.5272,
      "step": 1020
    },
    {
      "epoch": 0.8142190447820474,
      "grad_norm": 0.8236450629839644,
      "learning_rate": 2.0524089069252106e-06,
      "loss": 1.5028,
      "step": 1025
    },
    {
      "epoch": 0.8181908450004964,
      "grad_norm": 0.8275359166427305,
      "learning_rate": 1.9689580682498553e-06,
      "loss": 1.5268,
      "step": 1030
    },
    {
      "epoch": 0.8221626452189454,
      "grad_norm": 0.8081276364031936,
      "learning_rate": 1.887053593911149e-06,
      "loss": 1.5427,
      "step": 1035
    },
    {
      "epoch": 0.8261344454373944,
      "grad_norm": 0.8237710475989976,
      "learning_rate": 1.806711254485215e-06,
      "loss": 1.5389,
      "step": 1040
    },
    {
      "epoch": 0.8301062456558436,
      "grad_norm": 0.8384266531879961,
      "learning_rate": 1.727946519761583e-06,
      "loss": 1.5015,
      "step": 1045
    },
    {
      "epoch": 0.8340780458742926,
      "grad_norm": 0.8364585808120162,
      "learning_rate": 1.6507745557645127e-06,
      "loss": 1.5009,
      "step": 1050
    },
    {
      "epoch": 0.8380498460927416,
      "grad_norm": 0.8244749141974579,
      "learning_rate": 1.575210221832799e-06,
      "loss": 1.525,
      "step": 1055
    },
    {
      "epoch": 0.8420216463111906,
      "grad_norm": 0.8040790359125747,
      "learning_rate": 1.5012680677586222e-06,
      "loss": 1.5134,
      "step": 1060
    },
    {
      "epoch": 0.8459934465296396,
      "grad_norm": 0.7975576330551882,
      "learning_rate": 1.4432918921243055e-06,
      "loss": 1.5128,
      "step": 1065
    },
    {
      "epoch": 0.8499652467480886,
      "grad_norm": 0.7987138186718241,
      "learning_rate": 1.3723053285030463e-06,
      "loss": 1.5146,
      "step": 1070
    },
    {
      "epoch": 0.8539370469665376,
      "grad_norm": 0.8041473257969093,
      "learning_rate": 1.3029800137534632e-06,
      "loss": 1.56,
      "step": 1075
    },
    {
      "epoch": 0.8579088471849866,
      "grad_norm": 0.8017665713122328,
      "learning_rate": 1.235329296354526e-06,
      "loss": 1.5104,
      "step": 1080
    },
    {
      "epoch": 0.8618806474034356,
      "grad_norm": 0.8106278967681714,
      "learning_rate": 1.1693662023441577e-06,
      "loss": 1.5272,
      "step": 1085
    },
    {
      "epoch": 0.8658524476218846,
      "grad_norm": 0.8102683680589345,
      "learning_rate": 1.1051034328110776e-06,
      "loss": 1.5276,
      "step": 1090
    },
    {
      "epoch": 0.8698242478403336,
      "grad_norm": 0.8245536236761711,
      "learning_rate": 1.0425533614492412e-06,
      "loss": 1.5436,
      "step": 1095
    },
    {
      "epoch": 0.8737960480587826,
      "grad_norm": 0.8040355908066084,
      "learning_rate": 9.817280321752898e-07,
      "loss": 1.5007,
      "step": 1100
    },
    {
      "epoch": 0.8777678482772316,
      "grad_norm": 0.8123936825847906,
      "learning_rate": 9.226391568095306e-07,
      "loss": 1.5176,
      "step": 1105
    },
    {
      "epoch": 0.8817396484956807,
      "grad_norm": 0.7956886909023395,
      "learning_rate": 8.652981128208315e-07,
      "loss": 1.5135,
      "step": 1110
    },
    {
      "epoch": 0.8857114487141297,
      "grad_norm": 0.8198537899032718,
      "learning_rate": 8.097159411359135e-07,
      "loss": 1.5309,
      "step": 1115
    },
    {
      "epoch": 0.8896832489325787,
      "grad_norm": 0.782162948876607,
      "learning_rate": 7.559033440134311e-07,
      "loss": 1.5287,
      "step": 1120
    },
    {
      "epoch": 0.8936550491510277,
      "grad_norm": 0.8040565507060439,
      "learning_rate": 7.038706829832808e-07,
      "loss": 1.519,
      "step": 1125
    },
    {
      "epoch": 0.8976268493694767,
      "grad_norm": 0.8259422263740301,
      "learning_rate": 6.536279768514952e-07,
      "loss": 1.5137,
      "step": 1130
    },
    {
      "epoch": 0.9015986495879257,
      "grad_norm": 0.8095546356558041,
      "learning_rate": 6.051848997711395e-07,
      "loss": 1.5288,
      "step": 1135
    },
    {
      "epoch": 0.9055704498063747,
      "grad_norm": 0.8093720744796288,
      "learning_rate": 5.585507793795763e-07,
      "loss": 1.5212,
      "step": 1140
    },
    {
      "epoch": 0.9095422500248238,
      "grad_norm": 0.8077656938263478,
      "learning_rate": 5.137345950024309e-07,
      "loss": 1.4942,
      "step": 1145
    },
    {
      "epoch": 0.9135140502432728,
      "grad_norm": 0.8113816503121841,
      "learning_rate": 4.7074497592465074e-07,
      "loss": 1.5361,
      "step": 1150
    },
    {
      "epoch": 0.9174858504617218,
      "grad_norm": 0.813231569130407,
      "learning_rate": 4.2959019972893644e-07,
      "loss": 1.5306,
      "step": 1155
    },
    {
      "epoch": 0.9214576506801708,
      "grad_norm": 0.8099017485840051,
      "learning_rate": 3.9027819070191706e-07,
      "loss": 1.5137,
      "step": 1160
    },
    {
      "epoch": 0.9254294508986198,
      "grad_norm": 0.7881148123811212,
      "learning_rate": 3.5281651830833987e-07,
      "loss": 1.5193,
      "step": 1165
    },
    {
      "epoch": 0.9294012511170688,
      "grad_norm": 0.8097066087342274,
      "learning_rate": 3.1721239573357264e-07,
      "loss": 1.53,
      "step": 1170
    },
    {
      "epoch": 0.9333730513355178,
      "grad_norm": 0.7843276174810709,
      "learning_rate": 2.834726784947273e-07,
      "loss": 1.5407,
      "step": 1175
    },
    {
      "epoch": 0.9373448515539669,
      "grad_norm": 0.8036612590725093,
      "learning_rate": 2.5160386312063855e-07,
      "loss": 1.5143,
      "step": 1180
    },
    {
      "epoch": 0.9413166517724159,
      "grad_norm": 0.7981854209700985,
      "learning_rate": 2.2161208590096407e-07,
      "loss": 1.517,
      "step": 1185
    },
    {
      "epoch": 0.9452884519908649,
      "grad_norm": 0.7979530312793264,
      "learning_rate": 1.9350312170465234e-07,
      "loss": 1.5233,
      "step": 1190
    },
    {
      "epoch": 0.9492602522093139,
      "grad_norm": 0.8079703791008118,
      "learning_rate": 1.672823828680037e-07,
      "loss": 1.5193,
      "step": 1195
    },
    {
      "epoch": 0.9532320524277629,
      "grad_norm": 0.8283052457474909,
      "learning_rate": 1.4295491815253138e-07,
      "loss": 1.5306,
      "step": 1200
    },
    {
      "epoch": 0.9572038526462119,
      "grad_norm": 0.8204454900067728,
      "learning_rate": 1.205254117728316e-07,
      "loss": 1.5186,
      "step": 1205
    },
    {
      "epoch": 0.9611756528646609,
      "grad_norm": 0.8209623860337765,
      "learning_rate": 9.999818249464389e-08,
      "loss": 1.5262,
      "step": 1210
    },
    {
      "epoch": 0.9651474530831099,
      "grad_norm": 0.8116412111952241,
      "learning_rate": 8.137718280328166e-08,
      "loss": 1.5243,
      "step": 1215
    },
    {
      "epoch": 0.9691192533015589,
      "grad_norm": 0.7858348914688461,
      "learning_rate": 6.46659981425879e-08,
      "loss": 1.5169,
      "step": 1220
    },
    {
      "epoch": 0.9730910535200079,
      "grad_norm": 0.7965100492008761,
      "learning_rate": 4.9867846224559423e-08,
      "loss": 1.5238,
      "step": 1225
    },
    {
      "epoch": 0.9770628537384569,
      "grad_norm": 0.7942704227422176,
      "learning_rate": 3.6985576409787064e-08,
      "loss": 1.5131,
      "step": 1230
    },
    {
      "epoch": 0.9810346539569059,
      "grad_norm": 0.8182170297859925,
      "learning_rate": 2.6021669158811104e-08,
      "loss": 1.5142,
      "step": 1235
    },
    {
      "epoch": 0.9850064541753549,
      "grad_norm": 0.7890859967844505,
      "learning_rate": 1.697823555451561e-08,
      "loss": 1.5156,
      "step": 1240
    },
    {
      "epoch": 0.988978254393804,
      "grad_norm": 0.792384402297532,
      "learning_rate": 9.857016895642446e-09,
      "loss": 1.5025,
      "step": 1245
    },
    {
      "epoch": 0.992950054612253,
      "grad_norm": 0.7972540802101755,
      "learning_rate": 4.6593843615050374e-09,
      "loss": 1.5062,
      "step": 1250
    },
    {
      "epoch": 0.9969218548307021,
      "grad_norm": 0.7796868385026204,
      "learning_rate": 1.386338747972893e-09,
      "loss": 1.526,
      "step": 1255
    },
    {
      "epoch": 0.9993049349617714,
      "eval_loss": 1.5326974391937256,
      "eval_runtime": 248.3752,
      "eval_samples_per_second": 107.676,
      "eval_steps_per_second": 4.489,
      "step": 1258
    },
    {
      "epoch": 0.9993049349617714,
      "step": 1258,
      "total_flos": 106140763422720.0,
      "train_loss": 1.5477893754295022,
      "train_runtime": 7899.7649,
      "train_samples_per_second": 30.594,
      "train_steps_per_second": 0.159
    }
  ],
  "logging_steps": 5,
  "max_steps": 1258,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 106140763422720.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|