{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 40.0,
  "eval_steps": 500,
  "global_step": 160320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.124750499001996,
      "grad_norm": 13.176804542541504,
      "learning_rate": 1.9937624750499e-06,
      "loss": 0.2137,
      "step": 500
    },
    {
      "epoch": 0.249500998003992,
      "grad_norm": 52.68854904174805,
      "learning_rate": 1.9875249500998005e-06,
      "loss": 0.2463,
      "step": 1000
    },
    {
      "epoch": 0.37425149700598803,
      "grad_norm": 9.197150230407715,
      "learning_rate": 1.9812874251497004e-06,
      "loss": 0.2316,
      "step": 1500
    },
    {
      "epoch": 0.499001996007984,
      "grad_norm": 23.94010353088379,
      "learning_rate": 1.9750499001996007e-06,
      "loss": 0.2095,
      "step": 2000
    },
    {
      "epoch": 0.6237524950099801,
      "grad_norm": 25.69223976135254,
      "learning_rate": 1.968812375249501e-06,
      "loss": 0.2102,
      "step": 2500
    },
    {
      "epoch": 0.7485029940119761,
      "grad_norm": 14.870789527893066,
      "learning_rate": 1.9625748502994013e-06,
      "loss": 0.2335,
      "step": 3000
    },
    {
      "epoch": 0.873253493013972,
      "grad_norm": 19.752464294433594,
      "learning_rate": 1.9563373253493016e-06,
      "loss": 0.2065,
      "step": 3500
    },
    {
      "epoch": 0.998003992015968,
      "grad_norm": 7.356762409210205,
      "learning_rate": 1.9500998003992014e-06,
      "loss": 0.215,
      "step": 4000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.4418031871318817,
      "eval_runtime": 50.9423,
      "eval_samples_per_second": 62.934,
      "eval_steps_per_second": 15.743,
      "step": 4008
    },
    {
      "epoch": 1.122754491017964,
      "grad_norm": 1.049210786819458,
      "learning_rate": 1.9438622754491017e-06,
      "loss": 0.1786,
      "step": 4500
    },
    {
      "epoch": 1.24750499001996,
      "grad_norm": 33.95945358276367,
      "learning_rate": 1.937624750499002e-06,
      "loss": 0.2107,
      "step": 5000
    },
    {
      "epoch": 1.372255489021956,
      "grad_norm": 3.9420273303985596,
      "learning_rate": 1.931387225548902e-06,
      "loss": 0.1877,
      "step": 5500
    },
    {
      "epoch": 1.4970059880239521,
      "grad_norm": 15.459404945373535,
      "learning_rate": 1.925149700598802e-06,
      "loss": 0.1808,
      "step": 6000
    },
    {
      "epoch": 1.621756487025948,
      "grad_norm": 0.35231631994247437,
      "learning_rate": 1.9189121756487025e-06,
      "loss": 0.1842,
      "step": 6500
    },
    {
      "epoch": 1.746506986027944,
      "grad_norm": 22.17848014831543,
      "learning_rate": 1.9126746506986028e-06,
      "loss": 0.2165,
      "step": 7000
    },
    {
      "epoch": 1.8712574850299402,
      "grad_norm": 31.21565055847168,
      "learning_rate": 1.906437125748503e-06,
      "loss": 0.1879,
      "step": 7500
    },
    {
      "epoch": 1.996007984031936,
      "grad_norm": 1.7563763856887817,
      "learning_rate": 1.9001996007984032e-06,
      "loss": 0.1913,
      "step": 8000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.4956786632537842,
      "eval_runtime": 48.4339,
      "eval_samples_per_second": 66.193,
      "eval_steps_per_second": 16.559,
      "step": 8016
    },
    {
      "epoch": 2.1207584830339323,
      "grad_norm": 0.0910625234246254,
      "learning_rate": 1.8939620758483032e-06,
      "loss": 0.1712,
      "step": 8500
    },
    {
      "epoch": 2.245508982035928,
      "grad_norm": 30.4615421295166,
      "learning_rate": 1.8877245508982035e-06,
      "loss": 0.1579,
      "step": 9000
    },
    {
      "epoch": 2.370259481037924,
      "grad_norm": 29.169662475585938,
      "learning_rate": 1.8814870259481036e-06,
      "loss": 0.1701,
      "step": 9500
    },
    {
      "epoch": 2.49500998003992,
      "grad_norm": 0.9950535893440247,
      "learning_rate": 1.875249500998004e-06,
      "loss": 0.1717,
      "step": 10000
    },
    {
      "epoch": 2.6197604790419162,
      "grad_norm": 0.30978772044181824,
      "learning_rate": 1.8690119760479042e-06,
      "loss": 0.1778,
      "step": 10500
    },
    {
      "epoch": 2.744510978043912,
      "grad_norm": 1.4617693424224854,
      "learning_rate": 1.8627744510978043e-06,
      "loss": 0.1772,
      "step": 11000
    },
    {
      "epoch": 2.8692614770459084,
      "grad_norm": 13.257425308227539,
      "learning_rate": 1.8565369261477044e-06,
      "loss": 0.1697,
      "step": 11500
    },
    {
      "epoch": 2.9940119760479043,
      "grad_norm": 11.522214889526367,
      "learning_rate": 1.8502994011976047e-06,
      "loss": 0.1651,
      "step": 12000
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.4591982960700989,
      "eval_runtime": 48.8032,
      "eval_samples_per_second": 65.692,
      "eval_steps_per_second": 16.433,
      "step": 12024
    },
    {
      "epoch": 3.1187624750499,
      "grad_norm": 20.58974266052246,
      "learning_rate": 1.844061876247505e-06,
      "loss": 0.1607,
      "step": 12500
    },
    {
      "epoch": 3.243512974051896,
      "grad_norm": 54.52241516113281,
      "learning_rate": 1.8378243512974053e-06,
      "loss": 0.1527,
      "step": 13000
    },
    {
      "epoch": 3.3682634730538923,
      "grad_norm": 12.846843719482422,
      "learning_rate": 1.8315868263473054e-06,
      "loss": 0.1484,
      "step": 13500
    },
    {
      "epoch": 3.493013972055888,
      "grad_norm": 0.6479830145835876,
      "learning_rate": 1.8253493013972054e-06,
      "loss": 0.1621,
      "step": 14000
    },
    {
      "epoch": 3.6177644710578845,
      "grad_norm": 0.7256312370300293,
      "learning_rate": 1.8191117764471057e-06,
      "loss": 0.1428,
      "step": 14500
    },
    {
      "epoch": 3.7425149700598803,
      "grad_norm": 12.274479866027832,
      "learning_rate": 1.8128742514970058e-06,
      "loss": 0.1433,
      "step": 15000
    },
    {
      "epoch": 3.867265469061876,
      "grad_norm": 35.40715408325195,
      "learning_rate": 1.8066367265469061e-06,
      "loss": 0.161,
      "step": 15500
    },
    {
      "epoch": 3.992015968063872,
      "grad_norm": 0.78450608253479,
      "learning_rate": 1.8003992015968064e-06,
      "loss": 0.171,
      "step": 16000
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.4495457410812378,
      "eval_runtime": 49.2624,
      "eval_samples_per_second": 65.08,
      "eval_steps_per_second": 16.28,
      "step": 16032
    },
    {
      "epoch": 4.116766467065868,
      "grad_norm": 0.06905636936426163,
      "learning_rate": 1.7941616766467065e-06,
      "loss": 0.1475,
      "step": 16500
    },
    {
      "epoch": 4.241516966067865,
      "grad_norm": 34.77931213378906,
      "learning_rate": 1.7879241516966066e-06,
      "loss": 0.1365,
      "step": 17000
    },
    {
      "epoch": 4.3662674650698605,
      "grad_norm": 0.5809102058410645,
      "learning_rate": 1.7816866267465069e-06,
      "loss": 0.1366,
      "step": 17500
    },
    {
      "epoch": 4.491017964071856,
      "grad_norm": 66.70156860351562,
      "learning_rate": 1.775449101796407e-06,
      "loss": 0.1526,
      "step": 18000
    },
    {
      "epoch": 4.615768463073852,
      "grad_norm": 29.423938751220703,
      "learning_rate": 1.7692115768463075e-06,
      "loss": 0.135,
      "step": 18500
    },
    {
      "epoch": 4.740518962075848,
      "grad_norm": 0.48827868700027466,
      "learning_rate": 1.7629740518962075e-06,
      "loss": 0.1444,
      "step": 19000
    },
    {
      "epoch": 4.865269461077844,
      "grad_norm": 8.966581344604492,
      "learning_rate": 1.7567365269461076e-06,
      "loss": 0.1295,
      "step": 19500
    },
    {
      "epoch": 4.99001996007984,
      "grad_norm": 3.6332414150238037,
      "learning_rate": 1.750499001996008e-06,
      "loss": 0.1407,
      "step": 20000
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.5054113268852234,
      "eval_runtime": 46.022,
      "eval_samples_per_second": 69.662,
      "eval_steps_per_second": 17.426,
      "step": 20040
    },
    {
      "epoch": 5.114770459081837,
      "grad_norm": 26.99722671508789,
      "learning_rate": 1.744261477045908e-06,
      "loss": 0.1307,
      "step": 20500
    },
    {
      "epoch": 5.2395209580838324,
      "grad_norm": 0.7371481657028198,
      "learning_rate": 1.7380239520958083e-06,
      "loss": 0.1153,
      "step": 21000
    },
    {
      "epoch": 5.364271457085828,
      "grad_norm": 0.3232800364494324,
      "learning_rate": 1.7317864271457086e-06,
      "loss": 0.1154,
      "step": 21500
    },
    {
      "epoch": 5.489021956087824,
      "grad_norm": 1.8309438228607178,
      "learning_rate": 1.7255489021956087e-06,
      "loss": 0.1331,
      "step": 22000
    },
    {
      "epoch": 5.61377245508982,
      "grad_norm": 0.4226222038269043,
      "learning_rate": 1.719311377245509e-06,
      "loss": 0.1206,
      "step": 22500
    },
    {
      "epoch": 5.738522954091817,
      "grad_norm": 1.4337540864944458,
      "learning_rate": 1.713073852295409e-06,
      "loss": 0.13,
      "step": 23000
    },
    {
      "epoch": 5.863273453093813,
      "grad_norm": 47.5312614440918,
      "learning_rate": 1.7068363273453091e-06,
      "loss": 0.1285,
      "step": 23500
    },
    {
      "epoch": 5.9880239520958085,
      "grad_norm": 1.092816710472107,
      "learning_rate": 1.7005988023952097e-06,
      "loss": 0.1412,
      "step": 24000
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.4939550459384918,
      "eval_runtime": 45.0298,
      "eval_samples_per_second": 71.197,
      "eval_steps_per_second": 17.81,
      "step": 24048
    },
    {
      "epoch": 6.112774451097804,
      "grad_norm": 0.03936842083930969,
      "learning_rate": 1.6943612774451097e-06,
      "loss": 0.1134,
      "step": 24500
    },
    {
      "epoch": 6.2375249500998,
      "grad_norm": 3.047616481781006,
      "learning_rate": 1.6881237524950098e-06,
      "loss": 0.1066,
      "step": 25000
    },
    {
      "epoch": 6.362275449101796,
      "grad_norm": 16.7564754486084,
      "learning_rate": 1.6818862275449101e-06,
      "loss": 0.1615,
      "step": 25500
    },
    {
      "epoch": 6.487025948103792,
      "grad_norm": 21.36778450012207,
      "learning_rate": 1.6756487025948102e-06,
      "loss": 0.1645,
      "step": 26000
    },
    {
      "epoch": 6.611776447105789,
      "grad_norm": 78.45208740234375,
      "learning_rate": 1.6694111776447105e-06,
      "loss": 0.1675,
      "step": 26500
    },
    {
      "epoch": 6.736526946107785,
      "grad_norm": 7.212148666381836,
      "learning_rate": 1.6631736526946108e-06,
      "loss": 0.146,
      "step": 27000
    },
    {
      "epoch": 6.86127744510978,
      "grad_norm": 9.503207206726074,
      "learning_rate": 1.6569361277445109e-06,
      "loss": 0.1606,
      "step": 27500
    },
    {
      "epoch": 6.986027944111776,
      "grad_norm": 0.4464740753173828,
      "learning_rate": 1.6506986027944112e-06,
      "loss": 0.1429,
      "step": 28000
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.4717544615268707,
      "eval_runtime": 47.386,
      "eval_samples_per_second": 67.657,
      "eval_steps_per_second": 16.925,
      "step": 28056
    },
    {
      "epoch": 7.110778443113772,
      "grad_norm": 0.42686018347740173,
      "learning_rate": 1.6444610778443113e-06,
      "loss": 0.1207,
      "step": 28500
    },
    {
      "epoch": 7.235528942115769,
      "grad_norm": 24.92848014831543,
      "learning_rate": 1.6382235528942113e-06,
      "loss": 0.1351,
      "step": 29000
    },
    {
      "epoch": 7.360279441117765,
      "grad_norm": 7.397327423095703,
      "learning_rate": 1.6319860279441118e-06,
      "loss": 0.1543,
      "step": 29500
    },
    {
      "epoch": 7.485029940119761,
      "grad_norm": 0.43539106845855713,
      "learning_rate": 1.625748502994012e-06,
      "loss": 0.1494,
      "step": 30000
    },
    {
      "epoch": 7.6097804391217565,
      "grad_norm": 14.456055641174316,
      "learning_rate": 1.619510978043912e-06,
      "loss": 0.1419,
      "step": 30500
    },
    {
      "epoch": 7.734530938123752,
      "grad_norm": 9.563997268676758,
      "learning_rate": 1.6132734530938123e-06,
      "loss": 0.1357,
      "step": 31000
    },
    {
      "epoch": 7.859281437125748,
      "grad_norm": 1.7568217515945435,
      "learning_rate": 1.6070359281437124e-06,
      "loss": 0.1369,
      "step": 31500
    },
    {
      "epoch": 7.984031936127744,
      "grad_norm": 2.780186653137207,
      "learning_rate": 1.600798403193613e-06,
      "loss": 0.1451,
      "step": 32000
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.45947006344795227,
      "eval_runtime": 44.2941,
      "eval_samples_per_second": 72.38,
      "eval_steps_per_second": 18.106,
      "step": 32064
    },
    {
      "epoch": 8.10878243512974,
      "grad_norm": 10.451217651367188,
      "learning_rate": 1.594560878243513e-06,
      "loss": 0.1136,
      "step": 32500
    },
    {
      "epoch": 8.233532934131736,
      "grad_norm": 0.18200552463531494,
      "learning_rate": 1.588323353293413e-06,
      "loss": 0.1259,
      "step": 33000
    },
    {
      "epoch": 8.358283433133732,
      "grad_norm": 1.9428528547286987,
      "learning_rate": 1.5820858283433134e-06,
      "loss": 0.1279,
      "step": 33500
    },
    {
      "epoch": 8.48303393213573,
      "grad_norm": 1.7016535997390747,
      "learning_rate": 1.5758483033932135e-06,
      "loss": 0.1231,
      "step": 34000
    },
    {
      "epoch": 8.607784431137725,
      "grad_norm": 0.7158037424087524,
      "learning_rate": 1.5696107784431135e-06,
      "loss": 0.1446,
      "step": 34500
    },
    {
      "epoch": 8.732534930139721,
      "grad_norm": 0.4712078273296356,
      "learning_rate": 1.563373253493014e-06,
      "loss": 0.1344,
      "step": 35000
    },
    {
      "epoch": 8.857285429141717,
      "grad_norm": 24.5105037689209,
      "learning_rate": 1.5571357285429141e-06,
      "loss": 0.1331,
      "step": 35500
    },
    {
      "epoch": 8.982035928143713,
      "grad_norm": 27.750621795654297,
      "learning_rate": 1.5508982035928142e-06,
      "loss": 0.1296,
      "step": 36000
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.47351646423339844,
      "eval_runtime": 41.3902,
      "eval_samples_per_second": 77.458,
      "eval_steps_per_second": 19.377,
      "step": 36072
    },
    {
      "epoch": 9.106786427145709,
      "grad_norm": 28.095134735107422,
      "learning_rate": 1.5446606786427145e-06,
      "loss": 0.119,
      "step": 36500
    },
    {
      "epoch": 9.231536926147704,
      "grad_norm": 0.07204411178827286,
      "learning_rate": 1.5384231536926146e-06,
      "loss": 0.1098,
      "step": 37000
    },
    {
      "epoch": 9.3562874251497,
      "grad_norm": 0.2767297327518463,
      "learning_rate": 1.532185628742515e-06,
      "loss": 0.1224,
      "step": 37500
    },
    {
      "epoch": 9.481037924151696,
      "grad_norm": 0.14060889184474945,
      "learning_rate": 1.5259481037924152e-06,
      "loss": 0.1247,
      "step": 38000
    },
    {
      "epoch": 9.605788423153692,
      "grad_norm": 32.673011779785156,
      "learning_rate": 1.5197105788423153e-06,
      "loss": 0.122,
      "step": 38500
    },
    {
      "epoch": 9.730538922155688,
      "grad_norm": 0.21247480809688568,
      "learning_rate": 1.5134730538922156e-06,
      "loss": 0.1233,
      "step": 39000
    },
    {
      "epoch": 9.855289421157684,
      "grad_norm": 0.4861377775669098,
      "learning_rate": 1.5072355289421156e-06,
      "loss": 0.1286,
      "step": 39500
    },
    {
      "epoch": 9.980039920159681,
      "grad_norm": 11.489697456359863,
      "learning_rate": 1.5009980039920157e-06,
      "loss": 0.1203,
      "step": 40000
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.44174808263778687,
      "eval_runtime": 40.4714,
      "eval_samples_per_second": 79.216,
      "eval_steps_per_second": 19.816,
      "step": 40080
    },
    {
      "epoch": 10.104790419161677,
      "grad_norm": 0.06284382939338684,
      "learning_rate": 1.4947604790419162e-06,
      "loss": 0.1176,
      "step": 40500
    },
    {
      "epoch": 10.229540918163673,
      "grad_norm": 0.8282334804534912,
      "learning_rate": 1.4885229540918163e-06,
      "loss": 0.1133,
      "step": 41000
    },
    {
      "epoch": 10.354291417165669,
      "grad_norm": 0.675163984298706,
      "learning_rate": 1.4822854291417164e-06,
      "loss": 0.0977,
      "step": 41500
    },
    {
      "epoch": 10.479041916167665,
      "grad_norm": 6.970102310180664,
      "learning_rate": 1.4760479041916167e-06,
      "loss": 0.1113,
      "step": 42000
    },
    {
      "epoch": 10.60379241516966,
      "grad_norm": 8.85517406463623,
      "learning_rate": 1.4698103792415168e-06,
      "loss": 0.1164,
      "step": 42500
    },
    {
      "epoch": 10.728542914171657,
      "grad_norm": 0.9282238483428955,
      "learning_rate": 1.4635728542914173e-06,
      "loss": 0.1167,
      "step": 43000
    },
    {
      "epoch": 10.853293413173652,
      "grad_norm": 9.984148979187012,
      "learning_rate": 1.4573353293413174e-06,
      "loss": 0.1261,
      "step": 43500
    },
    {
      "epoch": 10.978043912175648,
      "grad_norm": 0.20773719251155853,
      "learning_rate": 1.4510978043912175e-06,
      "loss": 0.1132,
      "step": 44000
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.49900639057159424,
      "eval_runtime": 43.241,
      "eval_samples_per_second": 74.143,
      "eval_steps_per_second": 18.547,
      "step": 44088
    },
    {
      "epoch": 11.102794411177644,
      "grad_norm": 12.603593826293945,
      "learning_rate": 1.4448602794411178e-06,
      "loss": 0.1061,
      "step": 44500
    },
    {
      "epoch": 11.22754491017964,
      "grad_norm": 51.32432174682617,
      "learning_rate": 1.4386227544910178e-06,
      "loss": 0.1079,
      "step": 45000
    },
    {
      "epoch": 11.352295409181636,
      "grad_norm": 10.22624397277832,
      "learning_rate": 1.432385229540918e-06,
      "loss": 0.1166,
      "step": 45500
    },
    {
      "epoch": 11.477045908183634,
      "grad_norm": 11.041003227233887,
      "learning_rate": 1.4261477045908184e-06,
      "loss": 0.105,
      "step": 46000
    },
    {
      "epoch": 11.60179640718563,
      "grad_norm": 35.79409408569336,
      "learning_rate": 1.4199101796407185e-06,
      "loss": 0.1124,
      "step": 46500
    },
    {
      "epoch": 11.726546906187625,
      "grad_norm": 0.18676696717739105,
      "learning_rate": 1.4136726546906188e-06,
      "loss": 0.0928,
      "step": 47000
    },
    {
      "epoch": 11.851297405189621,
      "grad_norm": 1.4925884008407593,
      "learning_rate": 1.4074351297405189e-06,
      "loss": 0.1098,
      "step": 47500
    },
    {
      "epoch": 11.976047904191617,
      "grad_norm": 0.32953181862831116,
      "learning_rate": 1.401197604790419e-06,
      "loss": 0.1117,
      "step": 48000
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.4872562289237976,
      "eval_runtime": 41.6872,
      "eval_samples_per_second": 76.906,
      "eval_steps_per_second": 19.239,
      "step": 48096
    },
    {
      "epoch": 12.100798403193613,
      "grad_norm": 0.027937307953834534,
      "learning_rate": 1.3949600798403195e-06,
      "loss": 0.0992,
      "step": 48500
    },
    {
      "epoch": 12.225548902195609,
      "grad_norm": 0.29068148136138916,
      "learning_rate": 1.3887225548902196e-06,
      "loss": 0.0921,
      "step": 49000
    },
    {
      "epoch": 12.350299401197605,
      "grad_norm": 0.127395898103714,
      "learning_rate": 1.3824850299401197e-06,
      "loss": 0.0933,
      "step": 49500
    },
    {
      "epoch": 12.4750499001996,
      "grad_norm": 0.09435238689184189,
      "learning_rate": 1.37624750499002e-06,
      "loss": 0.116,
      "step": 50000
    },
    {
      "epoch": 12.599800399201596,
      "grad_norm": 39.19729232788086,
      "learning_rate": 1.37000998003992e-06,
      "loss": 0.1052,
      "step": 50500
    },
    {
      "epoch": 12.724550898203592,
      "grad_norm": 0.28930047154426575,
      "learning_rate": 1.3637724550898201e-06,
      "loss": 0.1038,
      "step": 51000
    },
    {
      "epoch": 12.849301397205588,
      "grad_norm": 0.15510033071041107,
      "learning_rate": 1.3575349301397206e-06,
      "loss": 0.0983,
      "step": 51500
    },
    {
      "epoch": 12.974051896207584,
      "grad_norm": 81.58076477050781,
      "learning_rate": 1.3512974051896207e-06,
      "loss": 0.1117,
      "step": 52000
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.45387548208236694,
      "eval_runtime": 43.4012,
      "eval_samples_per_second": 73.869,
      "eval_steps_per_second": 18.479,
      "step": 52104
    },
    {
      "epoch": 13.098802395209582,
      "grad_norm": 4.060844421386719,
      "learning_rate": 1.345059880239521e-06,
      "loss": 0.0983,
      "step": 52500
    },
    {
      "epoch": 13.223552894211577,
      "grad_norm": 33.315853118896484,
      "learning_rate": 1.338822355289421e-06,
      "loss": 0.0941,
      "step": 53000
    },
    {
      "epoch": 13.348303393213573,
      "grad_norm": 0.1183587834239006,
      "learning_rate": 1.3325848303393212e-06,
      "loss": 0.0973,
      "step": 53500
    },
    {
      "epoch": 13.47305389221557,
      "grad_norm": 40.30908966064453,
      "learning_rate": 1.3263473053892215e-06,
      "loss": 0.0871,
      "step": 54000
    },
    {
      "epoch": 13.597804391217565,
      "grad_norm": 0.619777262210846,
      "learning_rate": 1.3201097804391218e-06,
      "loss": 0.1001,
      "step": 54500
    },
    {
      "epoch": 13.72255489021956,
      "grad_norm": 0.2705942392349243,
      "learning_rate": 1.3138722554890218e-06,
      "loss": 0.0983,
      "step": 55000
    },
    {
      "epoch": 13.847305389221557,
      "grad_norm": 6.151524066925049,
      "learning_rate": 1.3076347305389221e-06,
      "loss": 0.0793,
      "step": 55500
    },
    {
      "epoch": 13.972055888223553,
      "grad_norm": 2.340573787689209,
      "learning_rate": 1.3013972055888222e-06,
      "loss": 0.099,
      "step": 56000
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.47363531589508057,
      "eval_runtime": 42.3279,
      "eval_samples_per_second": 75.742,
      "eval_steps_per_second": 18.947,
      "step": 56112
    },
    {
      "epoch": 14.096806387225548,
      "grad_norm": 2.052589178085327,
      "learning_rate": 1.2951596806387225e-06,
      "loss": 0.0875,
      "step": 56500
    },
    {
      "epoch": 14.221556886227544,
      "grad_norm": 1.2925941944122314,
      "learning_rate": 1.2889221556886228e-06,
      "loss": 0.0812,
      "step": 57000
    },
    {
      "epoch": 14.34630738522954,
      "grad_norm": 0.062304213643074036,
      "learning_rate": 1.282684630738523e-06,
      "loss": 0.1017,
      "step": 57500
    },
    {
      "epoch": 14.471057884231538,
      "grad_norm": 0.1741693764925003,
      "learning_rate": 1.2764471057884232e-06,
      "loss": 0.0836,
      "step": 58000
    },
    {
      "epoch": 14.595808383233534,
      "grad_norm": 0.6444254517555237,
      "learning_rate": 1.2702095808383233e-06,
      "loss": 0.0804,
      "step": 58500
    },
    {
      "epoch": 14.72055888223553,
      "grad_norm": 2.0034759044647217,
      "learning_rate": 1.2639720558882234e-06,
      "loss": 0.0953,
      "step": 59000
    },
    {
      "epoch": 14.845309381237525,
      "grad_norm": 52.82548522949219,
      "learning_rate": 1.2577345309381237e-06,
      "loss": 0.0996,
      "step": 59500
    },
    {
      "epoch": 14.970059880239521,
      "grad_norm": 6.955111503601074,
      "learning_rate": 1.251497005988024e-06,
      "loss": 0.0857,
      "step": 60000
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.45942702889442444,
      "eval_runtime": 43.409,
      "eval_samples_per_second": 73.856,
      "eval_steps_per_second": 18.475,
      "step": 60120
    },
    {
      "epoch": 15.094810379241517,
      "grad_norm": 3.2324092388153076,
      "learning_rate": 1.245259481037924e-06,
      "loss": 0.0849,
      "step": 60500
    },
    {
      "epoch": 15.219560878243513,
      "grad_norm": 61.83153533935547,
      "learning_rate": 1.2390219560878243e-06,
      "loss": 0.0798,
      "step": 61000
    },
    {
      "epoch": 15.344311377245509,
      "grad_norm": 0.015876924619078636,
      "learning_rate": 1.2327844311377244e-06,
      "loss": 0.0785,
      "step": 61500
    },
    {
      "epoch": 15.469061876247505,
      "grad_norm": 3.0025134086608887,
      "learning_rate": 1.2265469061876247e-06,
      "loss": 0.0881,
      "step": 62000
    },
    {
      "epoch": 15.5938123752495,
      "grad_norm": 12.912367820739746,
      "learning_rate": 1.220309381237525e-06,
      "loss": 0.0802,
      "step": 62500
    },
    {
      "epoch": 15.718562874251496,
      "grad_norm": 0.3600245714187622,
      "learning_rate": 1.214071856287425e-06,
      "loss": 0.0849,
      "step": 63000
    },
    {
      "epoch": 15.843313373253492,
      "grad_norm": 0.21024100482463837,
      "learning_rate": 1.2078343313373254e-06,
      "loss": 0.078,
      "step": 63500
    },
    {
      "epoch": 15.968063872255488,
      "grad_norm": 9.392132759094238,
      "learning_rate": 1.2015968063872255e-06,
      "loss": 0.0865,
      "step": 64000
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.48642057180404663,
      "eval_runtime": 46.7976,
      "eval_samples_per_second": 68.508,
      "eval_steps_per_second": 17.138,
      "step": 64128
    },
    {
      "epoch": 16.092814371257486,
      "grad_norm": 0.5227041244506836,
      "learning_rate": 1.1953592814371256e-06,
      "loss": 0.0722,
      "step": 64500
    },
    {
      "epoch": 16.21756487025948,
      "grad_norm": 25.282564163208008,
      "learning_rate": 1.1891217564870259e-06,
      "loss": 0.0981,
      "step": 65000
    },
    {
      "epoch": 16.342315369261478,
      "grad_norm": 0.6670591235160828,
      "learning_rate": 1.1828842315369261e-06,
      "loss": 0.0787,
      "step": 65500
    },
    {
      "epoch": 16.46706586826347,
      "grad_norm": 22.668352127075195,
      "learning_rate": 1.1766467065868262e-06,
      "loss": 0.0764,
      "step": 66000
    },
    {
      "epoch": 16.59181636726547,
      "grad_norm": 0.22597374022006989,
      "learning_rate": 1.1704091816367265e-06,
      "loss": 0.078,
      "step": 66500
    },
    {
      "epoch": 16.716566866267463,
      "grad_norm": 21.123409271240234,
      "learning_rate": 1.1641716566866266e-06,
      "loss": 0.0766,
      "step": 67000
    },
    {
      "epoch": 16.84131736526946,
      "grad_norm": 0.04259370267391205,
      "learning_rate": 1.157934131736527e-06,
      "loss": 0.0765,
      "step": 67500
    },
    {
      "epoch": 16.96606786427146,
      "grad_norm": 0.021560240536928177,
      "learning_rate": 1.1516966067864272e-06,
      "loss": 0.0785,
      "step": 68000
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.4793809652328491,
      "eval_runtime": 45.0906,
      "eval_samples_per_second": 71.101,
      "eval_steps_per_second": 17.786,
      "step": 68136
    },
    {
      "epoch": 17.090818363273453,
      "grad_norm": 9.094868659973145,
      "learning_rate": 1.1454590818363273e-06,
      "loss": 0.0647,
      "step": 68500
    },
    {
      "epoch": 17.21556886227545,
      "grad_norm": 0.195833221077919,
      "learning_rate": 1.1392215568862276e-06,
      "loss": 0.0698,
      "step": 69000
    },
    {
      "epoch": 17.340319361277444,
      "grad_norm": 0.18507197499275208,
      "learning_rate": 1.1329840319361277e-06,
      "loss": 0.0712,
      "step": 69500
    },
    {
      "epoch": 17.465069860279442,
      "grad_norm": 0.9911601543426514,
      "learning_rate": 1.1267465069860278e-06,
      "loss": 0.0752,
      "step": 70000
    },
    {
      "epoch": 17.589820359281436,
      "grad_norm": 1.9703953266143799,
      "learning_rate": 1.120508982035928e-06,
      "loss": 0.0675,
      "step": 70500
    },
    {
      "epoch": 17.714570858283434,
      "grad_norm": 41.10940933227539,
      "learning_rate": 1.1142714570858283e-06,
      "loss": 0.0705,
      "step": 71000
    },
    {
      "epoch": 17.839321357285428,
      "grad_norm": 15.87336254119873,
      "learning_rate": 1.1080339321357286e-06,
      "loss": 0.0763,
      "step": 71500
    },
    {
      "epoch": 17.964071856287426,
      "grad_norm": 0.060888275504112244,
      "learning_rate": 1.1017964071856287e-06,
      "loss": 0.0784,
      "step": 72000
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.4715409278869629,
      "eval_runtime": 44.1035,
      "eval_samples_per_second": 72.693,
      "eval_steps_per_second": 18.184,
      "step": 72144
    },
    {
      "epoch": 18.08882235528942,
      "grad_norm": 2.47182035446167,
      "learning_rate": 1.0955588822355288e-06,
      "loss": 0.0747,
      "step": 72500
    },
    {
      "epoch": 18.213572854291417,
      "grad_norm": 40.5880126953125,
      "learning_rate": 1.089321357285429e-06,
      "loss": 0.0678,
      "step": 73000
    },
    {
      "epoch": 18.338323353293415,
      "grad_norm": 0.4340246915817261,
      "learning_rate": 1.0830838323353294e-06,
      "loss": 0.0713,
      "step": 73500
    },
    {
      "epoch": 18.46307385229541,
      "grad_norm": 4.4763312339782715,
      "learning_rate": 1.0768463073852295e-06,
      "loss": 0.065,
      "step": 74000
    },
    {
      "epoch": 18.587824351297407,
      "grad_norm": 0.1397508829832077,
      "learning_rate": 1.0706087824351298e-06,
      "loss": 0.0727,
      "step": 74500
    },
    {
      "epoch": 18.7125748502994,
      "grad_norm": 7.134496212005615,
      "learning_rate": 1.0643712574850299e-06,
      "loss": 0.0605,
      "step": 75000
    },
    {
      "epoch": 18.8373253493014,
      "grad_norm": 0.05227530747652054,
      "learning_rate": 1.05813373253493e-06,
      "loss": 0.0764,
      "step": 75500
    },
    {
      "epoch": 18.962075848303392,
      "grad_norm": 17.22441864013672,
      "learning_rate": 1.0518962075848302e-06,
      "loss": 0.0696,
      "step": 76000
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.4802711308002472,
      "eval_runtime": 45.7109,
      "eval_samples_per_second": 70.136,
      "eval_steps_per_second": 17.545,
      "step": 76152
    },
    {
      "epoch": 19.08682634730539,
      "grad_norm": 0.02889215387403965,
      "learning_rate": 1.0456586826347305e-06,
      "loss": 0.0625,
      "step": 76500
    },
    {
      "epoch": 19.211576846307384,
      "grad_norm": 128.0497283935547,
      "learning_rate": 1.0394211576846308e-06,
      "loss": 0.0548,
      "step": 77000
    },
    {
      "epoch": 19.336327345309382,
      "grad_norm": 0.22108981013298035,
      "learning_rate": 1.033183632734531e-06,
      "loss": 0.0695,
      "step": 77500
    },
    {
      "epoch": 19.461077844311376,
      "grad_norm": 55.13557815551758,
      "learning_rate": 1.026946107784431e-06,
      "loss": 0.0679,
      "step": 78000
    },
    {
      "epoch": 19.585828343313374,
      "grad_norm": 3.5990562438964844,
      "learning_rate": 1.0207085828343313e-06,
      "loss": 0.0697,
      "step": 78500
    },
    {
      "epoch": 19.710578842315368,
      "grad_norm": 3.9640650749206543,
      "learning_rate": 1.0144710578842316e-06,
      "loss": 0.0699,
      "step": 79000
    },
    {
      "epoch": 19.835329341317365,
      "grad_norm": 0.3529013395309448,
      "learning_rate": 1.0082335329341317e-06,
      "loss": 0.0676,
      "step": 79500
    },
    {
      "epoch": 19.960079840319363,
      "grad_norm": 1.3875175714492798,
      "learning_rate": 1.001996007984032e-06,
      "loss": 0.0683,
      "step": 80000
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.5128437280654907,
      "eval_runtime": 46.0282,
      "eval_samples_per_second": 69.653,
      "eval_steps_per_second": 17.424,
      "step": 80160
    },
    {
      "epoch": 20.084830339321357,
      "grad_norm": 6.171479225158691,
      "learning_rate": 9.95758483033932e-07,
      "loss": 0.0698,
      "step": 80500
    },
    {
      "epoch": 20.209580838323355,
      "grad_norm": 0.012239497154951096,
      "learning_rate": 9.895209580838323e-07,
      "loss": 0.0532,
      "step": 81000
    },
    {
      "epoch": 20.33433133732535,
      "grad_norm": 7.920960426330566,
      "learning_rate": 9.832834331337324e-07,
      "loss": 0.0609,
      "step": 81500
    },
    {
      "epoch": 20.459081836327346,
      "grad_norm": 59.41933822631836,
      "learning_rate": 9.770459081836327e-07,
      "loss": 0.0653,
      "step": 82000
    },
    {
      "epoch": 20.58383233532934,
      "grad_norm": 0.10031065344810486,
      "learning_rate": 9.708083832335328e-07,
      "loss": 0.0497,
      "step": 82500
    },
    {
      "epoch": 20.708582834331338,
      "grad_norm": 5.42900276184082,
      "learning_rate": 9.645708582834331e-07,
      "loss": 0.061,
      "step": 83000
    },
    {
      "epoch": 20.833333333333332,
      "grad_norm": 20.380285263061523,
      "learning_rate": 9.583333333333334e-07,
      "loss": 0.0717,
      "step": 83500
    },
    {
      "epoch": 20.95808383233533,
      "grad_norm": 0.10651753097772598,
      "learning_rate": 9.520958083832335e-07,
      "loss": 0.0638,
      "step": 84000
    },
    {
      "epoch": 21.0,
      "eval_loss": 0.4833807945251465,
      "eval_runtime": 46.5592,
      "eval_samples_per_second": 68.859,
      "eval_steps_per_second": 17.225,
      "step": 84168
    },
    {
      "epoch": 21.082834331337324,
      "grad_norm": 0.3842374086380005,
      "learning_rate": 9.458582834331337e-07,
      "loss": 0.0603,
      "step": 84500
    },
    {
      "epoch": 21.20758483033932,
      "grad_norm": 51.563140869140625,
      "learning_rate": 9.396207584830339e-07,
      "loss": 0.06,
      "step": 85000
    },
    {
      "epoch": 21.33233532934132,
      "grad_norm": 0.037806153297424316,
      "learning_rate": 9.333832335329342e-07,
      "loss": 0.0612,
      "step": 85500
    },
    {
      "epoch": 21.457085828343313,
      "grad_norm": 0.11586946994066238,
      "learning_rate": 9.271457085828342e-07,
      "loss": 0.0664,
      "step": 86000
    },
    {
      "epoch": 21.58183632734531,
      "grad_norm": 0.34262338280677795,
      "learning_rate": 9.209081836327344e-07,
      "loss": 0.0602,
      "step": 86500
    },
    {
      "epoch": 21.706586826347305,
      "grad_norm": 0.11894870549440384,
      "learning_rate": 9.146706586826347e-07,
      "loss": 0.0522,
      "step": 87000
    },
    {
      "epoch": 21.831337325349303,
      "grad_norm": 0.1180167868733406,
      "learning_rate": 9.084331337325349e-07,
      "loss": 0.0616,
      "step": 87500
    },
    {
      "epoch": 21.956087824351297,
      "grad_norm": 0.09437087923288345,
      "learning_rate": 9.02195608782435e-07,
      "loss": 0.0607,
      "step": 88000
    },
    {
      "epoch": 22.0,
      "eval_loss": 0.4958905279636383,
      "eval_runtime": 44.4581,
      "eval_samples_per_second": 72.113,
      "eval_steps_per_second": 18.039,
      "step": 88176
    },
    {
      "epoch": 22.080838323353294,
      "grad_norm": 0.5892271399497986,
      "learning_rate": 8.959580838323353e-07,
      "loss": 0.058,
      "step": 88500
    },
    {
      "epoch": 22.20558882235529,
      "grad_norm": 1.0569002628326416,
      "learning_rate": 8.897205588822355e-07,
      "loss": 0.0559,
      "step": 89000
    },
    {
      "epoch": 22.330339321357286,
      "grad_norm": 50.68812561035156,
      "learning_rate": 8.834830339321357e-07,
      "loss": 0.05,
      "step": 89500
    },
    {
      "epoch": 22.45508982035928,
      "grad_norm": 0.08090469241142273,
      "learning_rate": 8.772455089820359e-07,
      "loss": 0.0595,
      "step": 90000
    },
    {
      "epoch": 22.579840319361278,
      "grad_norm": 14.62991714477539,
      "learning_rate": 8.710079840319361e-07,
      "loss": 0.059,
      "step": 90500
    },
    {
      "epoch": 22.704590818363272,
      "grad_norm": 0.2893312871456146,
      "learning_rate": 8.647704590818364e-07,
      "loss": 0.0518,
      "step": 91000
    },
    {
      "epoch": 22.82934131736527,
      "grad_norm": 22.239938735961914,
      "learning_rate": 8.585329341317364e-07,
      "loss": 0.0493,
      "step": 91500
    },
    {
      "epoch": 22.954091816367267,
      "grad_norm": 0.09933929890394211,
      "learning_rate": 8.522954091816366e-07,
      "loss": 0.0536,
      "step": 92000
    },
    {
      "epoch": 23.0,
      "eval_loss": 0.48672357201576233,
      "eval_runtime": 44.6856,
      "eval_samples_per_second": 71.746,
      "eval_steps_per_second": 17.948,
      "step": 92184
    },
    {
      "epoch": 23.07884231536926,
      "grad_norm": 0.821902871131897,
      "learning_rate": 8.460578842315369e-07,
      "loss": 0.0553,
      "step": 92500
    },
    {
      "epoch": 23.20359281437126,
      "grad_norm": 0.2537296414375305,
      "learning_rate": 8.398203592814371e-07,
      "loss": 0.046,
      "step": 93000
    },
    {
      "epoch": 23.328343313373253,
      "grad_norm": 0.198989599943161,
      "learning_rate": 8.335828343313372e-07,
      "loss": 0.0496,
      "step": 93500
    },
    {
      "epoch": 23.45309381237525,
      "grad_norm": 14.523540496826172,
      "learning_rate": 8.273453093812375e-07,
      "loss": 0.0465,
      "step": 94000
    },
    {
      "epoch": 23.577844311377245,
      "grad_norm": 0.3473449945449829,
      "learning_rate": 8.211077844311377e-07,
      "loss": 0.048,
      "step": 94500
    },
    {
      "epoch": 23.702594810379242,
      "grad_norm": 4.4253129959106445,
      "learning_rate": 8.14870259481038e-07,
      "loss": 0.0489,
      "step": 95000
    },
    {
      "epoch": 23.827345309381236,
      "grad_norm": 159.51025390625,
      "learning_rate": 8.086327345309381e-07,
      "loss": 0.0552,
      "step": 95500
    },
    {
      "epoch": 23.952095808383234,
      "grad_norm": 0.31450316309928894,
      "learning_rate": 8.023952095808383e-07,
      "loss": 0.0537,
      "step": 96000
    },
    {
      "epoch": 24.0,
      "eval_loss": 0.5026536583900452,
      "eval_runtime": 46.0362,
      "eval_samples_per_second": 69.641,
      "eval_steps_per_second": 17.421,
      "step": 96192
    },
    {
      "epoch": 24.076846307385228,
      "grad_norm": 1.8670942783355713,
      "learning_rate": 7.961576846307386e-07,
      "loss": 0.0556,
      "step": 96500
    },
    {
      "epoch": 24.201596806387226,
      "grad_norm": 0.4119631052017212,
      "learning_rate": 7.899201596806386e-07,
      "loss": 0.0427,
      "step": 97000
    },
    {
      "epoch": 24.32634730538922,
      "grad_norm": 4.47167444229126,
      "learning_rate": 7.836826347305388e-07,
      "loss": 0.0579,
      "step": 97500
    },
    {
      "epoch": 24.451097804391217,
      "grad_norm": 0.940743625164032,
      "learning_rate": 7.774451097804391e-07,
      "loss": 0.0462,
      "step": 98000
    },
    {
      "epoch": 24.575848303393215,
      "grad_norm": 4.091241359710693,
      "learning_rate": 7.712075848303393e-07,
      "loss": 0.0524,
      "step": 98500
    },
    {
      "epoch": 24.70059880239521,
      "grad_norm": 11.099757194519043,
      "learning_rate": 7.649700598802394e-07,
      "loss": 0.0549,
      "step": 99000
    },
    {
      "epoch": 24.825349301397207,
      "grad_norm": 2.001067876815796,
      "learning_rate": 7.587325349301397e-07,
      "loss": 0.0485,
      "step": 99500
    },
    {
      "epoch": 24.9500998003992,
      "grad_norm": 0.15496690571308136,
      "learning_rate": 7.524950099800399e-07,
      "loss": 0.0537,
      "step": 100000
    },
    {
      "epoch": 25.0,
      "eval_loss": 0.48970088362693787,
      "eval_runtime": 48.0502,
      "eval_samples_per_second": 66.722,
      "eval_steps_per_second": 16.691,
      "step": 100200
    },
    {
      "epoch": 25.0748502994012,
      "grad_norm": 5.718461513519287,
      "learning_rate": 7.462574850299402e-07,
      "loss": 0.0471,
      "step": 100500
    },
    {
      "epoch": 25.199600798403193,
      "grad_norm": 53.097293853759766,
      "learning_rate": 7.400199600798403e-07,
      "loss": 0.0467,
      "step": 101000
    },
    {
      "epoch": 25.32435129740519,
      "grad_norm": 70.51046752929688,
      "learning_rate": 7.337824351297404e-07,
      "loss": 0.0464,
      "step": 101500
    },
    {
      "epoch": 25.449101796407184,
      "grad_norm": 6.485039234161377,
      "learning_rate": 7.275449101796407e-07,
      "loss": 0.0501,
      "step": 102000
    },
    {
      "epoch": 25.573852295409182,
      "grad_norm": 0.2076825648546219,
      "learning_rate": 7.213073852295409e-07,
      "loss": 0.05,
      "step": 102500
    },
    {
      "epoch": 25.698602794411176,
      "grad_norm": 40.60255432128906,
      "learning_rate": 7.15069860279441e-07,
      "loss": 0.0374,
      "step": 103000
    },
    {
      "epoch": 25.823353293413174,
      "grad_norm": 1.1958940029144287,
      "learning_rate": 7.088323353293413e-07,
      "loss": 0.0533,
      "step": 103500
    },
    {
      "epoch": 25.948103792415168,
      "grad_norm": 11.201072692871094,
      "learning_rate": 7.025948103792415e-07,
      "loss": 0.0388,
      "step": 104000
    },
    {
      "epoch": 26.0,
      "eval_loss": 0.48730549216270447,
      "eval_runtime": 48.7336,
      "eval_samples_per_second": 65.786,
      "eval_steps_per_second": 16.457,
      "step": 104208
    },
    {
      "epoch": 26.072854291417165,
      "grad_norm": 0.08899884670972824,
      "learning_rate": 6.963572854291417e-07,
      "loss": 0.0482,
      "step": 104500
    },
    {
      "epoch": 26.197604790419163,
      "grad_norm": 0.08736108243465424,
      "learning_rate": 6.901197604790419e-07,
      "loss": 0.042,
      "step": 105000
    },
    {
      "epoch": 26.322355289421157,
      "grad_norm": 0.050059039145708084,
      "learning_rate": 6.838822355289421e-07,
      "loss": 0.0443,
      "step": 105500
    },
    {
      "epoch": 26.447105788423155,
      "grad_norm": 0.3098917603492737,
      "learning_rate": 6.776447105788423e-07,
      "loss": 0.0431,
      "step": 106000
    },
    {
      "epoch": 26.57185628742515,
      "grad_norm": 0.601845920085907,
      "learning_rate": 6.714071856287425e-07,
      "loss": 0.0474,
      "step": 106500
    },
    {
      "epoch": 26.696606786427147,
      "grad_norm": 43.90340805053711,
      "learning_rate": 6.651696606786426e-07,
      "loss": 0.0546,
      "step": 107000
    },
    {
      "epoch": 26.82135728542914,
      "grad_norm": 0.1658441424369812,
      "learning_rate": 6.589321357285429e-07,
      "loss": 0.0463,
      "step": 107500
    },
    {
      "epoch": 26.94610778443114,
      "grad_norm": 0.7097954154014587,
      "learning_rate": 6.526946107784431e-07,
      "loss": 0.0413,
      "step": 108000
    },
    {
      "epoch": 27.0,
      "eval_loss": 0.49195966124534607,
      "eval_runtime": 48.5815,
      "eval_samples_per_second": 65.992,
      "eval_steps_per_second": 16.508,
      "step": 108216
    },
    {
      "epoch": 27.070858283433132,
      "grad_norm": 0.12945351004600525,
      "learning_rate": 6.464570858283432e-07,
      "loss": 0.0514,
      "step": 108500
    },
    {
      "epoch": 27.19560878243513,
      "grad_norm": 0.09241262078285217,
      "learning_rate": 6.402195608782435e-07,
      "loss": 0.0454,
      "step": 109000
    },
    {
      "epoch": 27.320359281437124,
      "grad_norm": 0.07145562022924423,
      "learning_rate": 6.339820359281437e-07,
      "loss": 0.0381,
      "step": 109500
    },
    {
      "epoch": 27.44510978043912,
      "grad_norm": 0.003607134334743023,
      "learning_rate": 6.277445109780439e-07,
      "loss": 0.0476,
      "step": 110000
    },
    {
      "epoch": 27.56986027944112,
      "grad_norm": 10.220846176147461,
      "learning_rate": 6.215069860279441e-07,
      "loss": 0.0441,
      "step": 110500
    },
    {
      "epoch": 27.694610778443113,
      "grad_norm": 0.18386581540107727,
      "learning_rate": 6.152694610778443e-07,
      "loss": 0.0461,
      "step": 111000
    },
    {
      "epoch": 27.81936127744511,
      "grad_norm": 0.26254481077194214,
      "learning_rate": 6.090319361277445e-07,
      "loss": 0.0367,
      "step": 111500
    },
    {
      "epoch": 27.944111776447105,
      "grad_norm": 68.7042007446289,
      "learning_rate": 6.027944111776448e-07,
      "loss": 0.0471,
      "step": 112000
    },
    {
      "epoch": 28.0,
      "eval_loss": 0.4870954751968384,
      "eval_runtime": 45.0714,
      "eval_samples_per_second": 71.132,
      "eval_steps_per_second": 17.794,
      "step": 112224
    },
    {
      "epoch": 28.068862275449103,
      "grad_norm": 0.0271464716643095,
      "learning_rate": 5.965568862275448e-07,
      "loss": 0.0433,
      "step": 112500
    },
    {
      "epoch": 28.193612774451097,
      "grad_norm": 0.0086235161870718,
      "learning_rate": 5.903193612774451e-07,
      "loss": 0.0475,
      "step": 113000
    },
    {
      "epoch": 28.318363273453095,
      "grad_norm": 0.11506126821041107,
      "learning_rate": 5.840818363273453e-07,
      "loss": 0.0353,
      "step": 113500
    },
    {
      "epoch": 28.44311377245509,
      "grad_norm": 10.355070114135742,
      "learning_rate": 5.778443113772454e-07,
      "loss": 0.0416,
      "step": 114000
    },
    {
      "epoch": 28.567864271457086,
      "grad_norm": 0.2200528234243393,
      "learning_rate": 5.716067864271457e-07,
      "loss": 0.0325,
      "step": 114500
    },
    {
      "epoch": 28.69261477045908,
      "grad_norm": 0.05802537873387337,
      "learning_rate": 5.653692614770459e-07,
      "loss": 0.0468,
      "step": 115000
    },
    {
      "epoch": 28.817365269461078,
      "grad_norm": 0.10829133540391922,
      "learning_rate": 5.591317365269461e-07,
      "loss": 0.042,
      "step": 115500
    },
    {
      "epoch": 28.942115768463076,
      "grad_norm": 0.162460595369339,
      "learning_rate": 5.528942115768463e-07,
      "loss": 0.049,
      "step": 116000
    },
    {
      "epoch": 29.0,
      "eval_loss": 0.4795687198638916,
      "eval_runtime": 45.1647,
      "eval_samples_per_second": 70.985,
      "eval_steps_per_second": 17.757,
      "step": 116232
    },
    {
      "epoch": 29.06686626746507,
      "grad_norm": 134.6587677001953,
      "learning_rate": 5.466566866267465e-07,
      "loss": 0.0416,
      "step": 116500
    },
    {
      "epoch": 29.191616766467067,
      "grad_norm": 0.09312257915735245,
      "learning_rate": 5.404191616766467e-07,
      "loss": 0.0287,
      "step": 117000
    },
    {
      "epoch": 29.31636726546906,
      "grad_norm": 0.3530866503715515,
      "learning_rate": 5.341816367265469e-07,
      "loss": 0.0384,
      "step": 117500
    },
    {
      "epoch": 29.44111776447106,
      "grad_norm": 0.033993642777204514,
      "learning_rate": 5.27944111776447e-07,
      "loss": 0.043,
      "step": 118000
    },
    {
      "epoch": 29.565868263473053,
      "grad_norm": 0.3124711513519287,
      "learning_rate": 5.217065868263473e-07,
      "loss": 0.04,
      "step": 118500
    },
    {
      "epoch": 29.69061876247505,
      "grad_norm": 10.49288272857666,
      "learning_rate": 5.154690618762475e-07,
      "loss": 0.0463,
      "step": 119000
    },
    {
      "epoch": 29.815369261477045,
      "grad_norm": 0.024224599823355675,
      "learning_rate": 5.092315369261477e-07,
      "loss": 0.0411,
      "step": 119500
    },
    {
      "epoch": 29.940119760479043,
      "grad_norm": 3.9215731620788574,
      "learning_rate": 5.029940119760479e-07,
      "loss": 0.0408,
      "step": 120000
    },
    {
      "epoch": 30.0,
      "eval_loss": 0.492553174495697,
      "eval_runtime": 46.2042,
      "eval_samples_per_second": 69.388,
      "eval_steps_per_second": 17.358,
      "step": 120240
    },
    {
      "epoch": 30.064870259481037,
      "grad_norm": 0.021667474880814552,
      "learning_rate": 4.967564870259481e-07,
      "loss": 0.0374,
      "step": 120500
    },
    {
      "epoch": 30.189620758483034,
      "grad_norm": 0.5888983011245728,
      "learning_rate": 4.905189620758483e-07,
      "loss": 0.0463,
      "step": 121000
    },
    {
      "epoch": 30.31437125748503,
      "grad_norm": 0.09637131541967392,
      "learning_rate": 4.842814371257485e-07,
      "loss": 0.033,
      "step": 121500
    },
    {
      "epoch": 30.439121756487026,
      "grad_norm": 0.23179832100868225,
      "learning_rate": 4.780439121756487e-07,
      "loss": 0.0402,
      "step": 122000
    },
    {
      "epoch": 30.563872255489024,
      "grad_norm": 0.14170564711093903,
      "learning_rate": 4.718063872255489e-07,
      "loss": 0.0395,
      "step": 122500
    },
    {
      "epoch": 30.688622754491018,
      "grad_norm": 0.006093321368098259,
      "learning_rate": 4.6556886227544903e-07,
      "loss": 0.0356,
      "step": 123000
    },
    {
      "epoch": 30.813373253493015,
      "grad_norm": 0.1018219068646431,
      "learning_rate": 4.593313373253493e-07,
      "loss": 0.0419,
      "step": 123500
    },
    {
      "epoch": 30.93812375249501,
      "grad_norm": 2.9131383895874023,
      "learning_rate": 4.5309381237524947e-07,
      "loss": 0.0378,
      "step": 124000
    },
    {
      "epoch": 31.0,
      "eval_loss": 0.5052226781845093,
      "eval_runtime": 43.1611,
      "eval_samples_per_second": 74.28,
      "eval_steps_per_second": 18.582,
      "step": 124248
    },
    {
      "epoch": 31.062874251497007,
      "grad_norm": 11.588695526123047,
      "learning_rate": 4.468562874251497e-07,
      "loss": 0.0346,
      "step": 124500
    },
    {
      "epoch": 31.187624750499,
      "grad_norm": 0.2488149255514145,
      "learning_rate": 4.4061876247504985e-07,
      "loss": 0.0351,
      "step": 125000
    },
    {
      "epoch": 31.312375249501,
      "grad_norm": 12.691544532775879,
      "learning_rate": 4.343812375249501e-07,
      "loss": 0.0323,
      "step": 125500
    },
    {
      "epoch": 31.437125748502993,
      "grad_norm": 0.004168800078332424,
      "learning_rate": 4.281437125748503e-07,
      "loss": 0.033,
      "step": 126000
    },
    {
      "epoch": 31.56187624750499,
      "grad_norm": 0.042690277099609375,
      "learning_rate": 4.219061876247505e-07,
      "loss": 0.039,
      "step": 126500
    },
    {
      "epoch": 31.686626746506985,
      "grad_norm": 1.1096973419189453,
      "learning_rate": 4.1566866267465066e-07,
      "loss": 0.0349,
      "step": 127000
    },
    {
      "epoch": 31.811377245508982,
      "grad_norm": 0.2642970085144043,
      "learning_rate": 4.094311377245509e-07,
      "loss": 0.0338,
      "step": 127500
    },
    {
      "epoch": 31.936127744510976,
      "grad_norm": 0.21338249742984772,
      "learning_rate": 4.031936127744511e-07,
      "loss": 0.0349,
      "step": 128000
    },
    {
      "epoch": 32.0,
      "eval_loss": 0.4927305281162262,
      "eval_runtime": 43.7641,
      "eval_samples_per_second": 73.256,
      "eval_steps_per_second": 18.326,
      "step": 128256
    },
    {
      "epoch": 32.060878243512974,
      "grad_norm": 0.1497274786233902,
      "learning_rate": 3.969560878243513e-07,
      "loss": 0.0403,
      "step": 128500
    },
    {
      "epoch": 32.18562874251497,
      "grad_norm": 0.5848351120948792,
      "learning_rate": 3.9071856287425147e-07,
      "loss": 0.037,
      "step": 129000
    },
    {
      "epoch": 32.31037924151697,
      "grad_norm": 0.11372077465057373,
      "learning_rate": 3.8448103792415166e-07,
      "loss": 0.0383,
      "step": 129500
    },
    {
      "epoch": 32.43512974051896,
      "grad_norm": 0.1047956719994545,
      "learning_rate": 3.782435129740519e-07,
      "loss": 0.0315,
      "step": 130000
    },
    {
      "epoch": 32.55988023952096,
      "grad_norm": 0.2975727617740631,
      "learning_rate": 3.7200598802395204e-07,
      "loss": 0.0264,
      "step": 130500
    },
    {
      "epoch": 32.684630738522955,
      "grad_norm": 0.2123280167579651,
      "learning_rate": 3.657684630738523e-07,
      "loss": 0.0341,
      "step": 131000
    },
    {
      "epoch": 32.80938123752495,
      "grad_norm": 27.63080596923828,
      "learning_rate": 3.5953093812375247e-07,
      "loss": 0.0368,
      "step": 131500
    },
    {
      "epoch": 32.93413173652694,
      "grad_norm": 0.034935545176267624,
      "learning_rate": 3.5329341317365266e-07,
      "loss": 0.0394,
      "step": 132000
    },
    {
      "epoch": 33.0,
      "eval_loss": 0.4937605559825897,
      "eval_runtime": 39.9355,
      "eval_samples_per_second": 80.279,
      "eval_steps_per_second": 20.082,
      "step": 132264
    },
    {
      "epoch": 33.05888223552894,
      "grad_norm": 0.003380158683285117,
      "learning_rate": 3.4705588822355285e-07,
      "loss": 0.0394,
      "step": 132500
    },
    {
      "epoch": 33.18363273453094,
      "grad_norm": 2.721451997756958,
      "learning_rate": 3.408183632734531e-07,
      "loss": 0.0365,
      "step": 133000
    },
    {
      "epoch": 33.308383233532936,
      "grad_norm": 0.4309988021850586,
      "learning_rate": 3.345808383233533e-07,
      "loss": 0.0302,
      "step": 133500
    },
    {
      "epoch": 33.43313373253493,
      "grad_norm": 0.24694228172302246,
      "learning_rate": 3.283433133732535e-07,
      "loss": 0.037,
      "step": 134000
    },
    {
      "epoch": 33.557884231536924,
      "grad_norm": 0.34988418221473694,
      "learning_rate": 3.2210578842315366e-07,
      "loss": 0.0258,
      "step": 134500
    },
    {
      "epoch": 33.68263473053892,
      "grad_norm": 0.19452495872974396,
      "learning_rate": 3.158682634730539e-07,
      "loss": 0.035,
      "step": 135000
    },
    {
      "epoch": 33.80738522954092,
      "grad_norm": 0.006651519797742367,
      "learning_rate": 3.096307385229541e-07,
      "loss": 0.0368,
      "step": 135500
    },
    {
      "epoch": 33.93213572854292,
      "grad_norm": 0.04128989204764366,
      "learning_rate": 3.033932135728543e-07,
      "loss": 0.0301,
      "step": 136000
    },
    {
      "epoch": 34.0,
      "eval_loss": 0.4872666597366333,
      "eval_runtime": 37.6035,
      "eval_samples_per_second": 85.258,
      "eval_steps_per_second": 21.328,
      "step": 136272
    },
    {
      "epoch": 34.05688622754491,
      "grad_norm": 0.05333876982331276,
      "learning_rate": 2.971556886227545e-07,
      "loss": 0.0349,
      "step": 136500
    },
    {
      "epoch": 34.181636726546905,
      "grad_norm": 1.3579726219177246,
      "learning_rate": 2.909181636726547e-07,
      "loss": 0.0285,
      "step": 137000
    },
    {
      "epoch": 34.3063872255489,
      "grad_norm": 0.6725994348526001,
      "learning_rate": 2.8468063872255486e-07,
      "loss": 0.0361,
      "step": 137500
    },
    {
      "epoch": 34.4311377245509,
      "grad_norm": 0.03919246420264244,
      "learning_rate": 2.7844311377245504e-07,
      "loss": 0.0274,
      "step": 138000
    },
    {
      "epoch": 34.55588822355289,
      "grad_norm": 35.5837287902832,
      "learning_rate": 2.722055888223553e-07,
      "loss": 0.0363,
      "step": 138500
    },
    {
      "epoch": 34.68063872255489,
      "grad_norm": 0.007728968746960163,
      "learning_rate": 2.659680638722555e-07,
      "loss": 0.0391,
      "step": 139000
    },
    {
      "epoch": 34.80538922155689,
      "grad_norm": 0.07272203266620636,
      "learning_rate": 2.5973053892215567e-07,
      "loss": 0.0268,
      "step": 139500
    },
    {
      "epoch": 34.930139720558884,
      "grad_norm": 0.33094656467437744,
      "learning_rate": 2.5349301397205586e-07,
      "loss": 0.0365,
      "step": 140000
    },
    {
      "epoch": 35.0,
      "eval_loss": 0.4920032024383545,
      "eval_runtime": 40.4781,
      "eval_samples_per_second": 79.203,
      "eval_steps_per_second": 19.813,
      "step": 140280
    },
    {
      "epoch": 35.054890219560875,
      "grad_norm": 191.99266052246094,
      "learning_rate": 2.472554890219561e-07,
      "loss": 0.0333,
      "step": 140500
    },
    {
      "epoch": 35.17964071856287,
      "grad_norm": 0.002573936013504863,
      "learning_rate": 2.410179640718563e-07,
      "loss": 0.0327,
      "step": 141000
    },
    {
      "epoch": 35.30439121756487,
      "grad_norm": 0.04750495404005051,
      "learning_rate": 2.3478043912175645e-07,
      "loss": 0.0345,
      "step": 141500
    },
    {
      "epoch": 35.42914171656687,
      "grad_norm": 193.8626251220703,
      "learning_rate": 2.2854291417165667e-07,
      "loss": 0.0321,
      "step": 142000
    },
    {
      "epoch": 35.553892215568865,
      "grad_norm": 0.0009173236903734505,
      "learning_rate": 2.2230538922155686e-07,
      "loss": 0.0359,
      "step": 142500
    },
    {
      "epoch": 35.678642714570856,
      "grad_norm": 0.12355954945087433,
      "learning_rate": 2.1606786427145708e-07,
      "loss": 0.0347,
      "step": 143000
    },
    {
      "epoch": 35.80339321357285,
      "grad_norm": 0.24140344560146332,
      "learning_rate": 2.0983033932135726e-07,
      "loss": 0.031,
      "step": 143500
    },
    {
      "epoch": 35.92814371257485,
      "grad_norm": 0.007129414472728968,
      "learning_rate": 2.0359281437125748e-07,
      "loss": 0.0214,
      "step": 144000
    },
    {
      "epoch": 36.0,
      "eval_loss": 0.4941750466823578,
      "eval_runtime": 38.7085,
      "eval_samples_per_second": 82.824,
      "eval_steps_per_second": 20.719,
      "step": 144288
    },
    {
      "epoch": 36.05289421157685,
      "grad_norm": 0.27973344922065735,
      "learning_rate": 1.9735528942115767e-07,
      "loss": 0.0331,
      "step": 144500
    },
    {
      "epoch": 36.17764471057884,
      "grad_norm": 0.05331612005829811,
      "learning_rate": 1.911177644710579e-07,
      "loss": 0.0303,
      "step": 145000
    },
    {
      "epoch": 36.30239520958084,
      "grad_norm": 1.8135106563568115,
      "learning_rate": 1.8488023952095808e-07,
      "loss": 0.0349,
      "step": 145500
    },
    {
      "epoch": 36.427145708582835,
      "grad_norm": 0.13009090721607208,
      "learning_rate": 1.7864271457085827e-07,
      "loss": 0.0405,
      "step": 146000
    },
    {
      "epoch": 36.55189620758483,
      "grad_norm": 0.07144490629434586,
      "learning_rate": 1.7240518962075848e-07,
      "loss": 0.0377,
      "step": 146500
    },
    {
      "epoch": 36.67664670658683,
      "grad_norm": 74.39689636230469,
      "learning_rate": 1.6616766467065867e-07,
      "loss": 0.0278,
      "step": 147000
    },
    {
      "epoch": 36.80139720558882,
      "grad_norm": 0.08526595681905746,
      "learning_rate": 1.599301397205589e-07,
      "loss": 0.0306,
      "step": 147500
    },
    {
      "epoch": 36.92614770459082,
      "grad_norm": 12.262850761413574,
      "learning_rate": 1.5369261477045908e-07,
      "loss": 0.0314,
      "step": 148000
    },
    {
      "epoch": 37.0,
      "eval_loss": 0.49442577362060547,
      "eval_runtime": 42.7404,
      "eval_samples_per_second": 75.011,
      "eval_steps_per_second": 18.764,
      "step": 148296
    },
    {
      "epoch": 37.050898203592816,
      "grad_norm": 0.02493446320295334,
      "learning_rate": 1.474550898203593e-07,
      "loss": 0.0262,
      "step": 148500
    },
    {
      "epoch": 37.17564870259481,
      "grad_norm": 0.14130648970603943,
      "learning_rate": 1.4121756487025949e-07,
      "loss": 0.0281,
      "step": 149000
    },
    {
      "epoch": 37.300399201596804,
      "grad_norm": 0.035768117755651474,
      "learning_rate": 1.3498003992015965e-07,
      "loss": 0.0255,
      "step": 149500
    },
    {
      "epoch": 37.4251497005988,
      "grad_norm": 0.18820720911026,
      "learning_rate": 1.2874251497005986e-07,
      "loss": 0.032,
      "step": 150000
    },
    {
      "epoch": 37.5499001996008,
      "grad_norm": 0.37001463770866394,
      "learning_rate": 1.2250499001996008e-07,
      "loss": 0.0301,
      "step": 150500
    },
    {
      "epoch": 37.6746506986028,
      "grad_norm": 0.06626907736063004,
      "learning_rate": 1.1626746506986028e-07,
      "loss": 0.0238,
      "step": 151000
    },
    {
      "epoch": 37.79940119760479,
      "grad_norm": 19.17169189453125,
      "learning_rate": 1.1002994011976049e-07,
      "loss": 0.0385,
      "step": 151500
    },
    {
      "epoch": 37.924151696606785,
      "grad_norm": 4.972864627838135,
      "learning_rate": 1.0379241516966066e-07,
      "loss": 0.0337,
      "step": 152000
    },
    {
      "epoch": 38.0,
      "eval_loss": 0.48605817556381226,
      "eval_runtime": 40.1954,
      "eval_samples_per_second": 79.76,
      "eval_steps_per_second": 19.953,
      "step": 152304
    },
    {
      "epoch": 38.04890219560878,
      "grad_norm": 0.002587054157629609,
      "learning_rate": 9.755489021956087e-08,
      "loss": 0.0334,
      "step": 152500
    },
    {
      "epoch": 38.17365269461078,
      "grad_norm": 70.7108383178711,
      "learning_rate": 9.131736526946107e-08,
      "loss": 0.0319,
      "step": 153000
    },
    {
      "epoch": 38.29840319361278,
      "grad_norm": 0.5694107413291931,
      "learning_rate": 8.507984031936127e-08,
      "loss": 0.0313,
      "step": 153500
    },
    {
      "epoch": 38.42315369261477,
      "grad_norm": 0.003176228841766715,
      "learning_rate": 7.884231536926148e-08,
      "loss": 0.0298,
      "step": 154000
    },
    {
      "epoch": 38.547904191616766,
      "grad_norm": 0.004230498801916838,
      "learning_rate": 7.260479041916168e-08,
      "loss": 0.0284,
      "step": 154500
    },
    {
      "epoch": 38.672654690618764,
      "grad_norm": 0.13844607770442963,
      "learning_rate": 6.636726546906188e-08,
      "loss": 0.0305,
      "step": 155000
    },
    {
      "epoch": 38.79740518962076,
      "grad_norm": 0.05394995957612991,
      "learning_rate": 6.012974051896207e-08,
      "loss": 0.0269,
      "step": 155500
    },
    {
      "epoch": 38.92215568862275,
      "grad_norm": 0.11763022094964981,
      "learning_rate": 5.3892215568862274e-08,
      "loss": 0.0279,
      "step": 156000
    },
    {
      "epoch": 39.0,
      "eval_loss": 0.4873499870300293,
      "eval_runtime": 44.0281,
      "eval_samples_per_second": 72.817,
      "eval_steps_per_second": 18.216,
      "step": 156312
    },
    {
      "epoch": 39.04690618762475,
      "grad_norm": 0.22139760851860046,
      "learning_rate": 4.765469061876248e-08,
      "loss": 0.0255,
      "step": 156500
    },
    {
      "epoch": 39.17165668662675,
      "grad_norm": 0.002428988926112652,
      "learning_rate": 4.1417165668662674e-08,
      "loss": 0.0302,
      "step": 157000
    },
    {
      "epoch": 39.296407185628745,
      "grad_norm": 0.07879871129989624,
      "learning_rate": 3.517964071856287e-08,
      "loss": 0.027,
      "step": 157500
    },
    {
      "epoch": 39.421157684630735,
      "grad_norm": 0.03594490885734558,
      "learning_rate": 2.8942115768463073e-08,
      "loss": 0.033,
      "step": 158000
    },
    {
      "epoch": 39.54590818363273,
      "grad_norm": 0.12444847822189331,
      "learning_rate": 2.2704590818363273e-08,
      "loss": 0.0271,
      "step": 158500
    },
    {
      "epoch": 39.67065868263473,
      "grad_norm": 47.82669448852539,
      "learning_rate": 1.6467065868263473e-08,
      "loss": 0.0276,
      "step": 159000
    },
    {
      "epoch": 39.79540918163673,
      "grad_norm": 0.1385308802127838,
      "learning_rate": 1.0229540918163672e-08,
      "loss": 0.03,
      "step": 159500
    },
    {
      "epoch": 39.920159680638726,
      "grad_norm": 0.1429419070482254,
      "learning_rate": 3.992015968063871e-09,
      "loss": 0.0303,
      "step": 160000
    }
  ],
  "logging_steps": 500,
  "max_steps": 160320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 40,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.21770798769152e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}