|
{ |
|
"best_metric": 1.9801934740826255e-06, |
|
"best_model_checkpoint": "Models/t5-base-class-gen/checkpoint-10400", |
|
"epoch": 4.9440298507462686, |
|
"eval_steps": 100, |
|
"global_step": 10600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04664179104477612, |
|
"grad_norm": 1.2865939140319824, |
|
"learning_rate": 3.9630597014925376e-05, |
|
"loss": 0.437, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04664179104477612, |
|
"eval_loss": 0.020251411944627762, |
|
"eval_runtime": 0.1587, |
|
"eval_samples_per_second": 189.027, |
|
"eval_steps_per_second": 25.204, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09328358208955224, |
|
"grad_norm": 0.69548100233078, |
|
"learning_rate": 3.925746268656717e-05, |
|
"loss": 0.0333, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09328358208955224, |
|
"eval_loss": 0.0021759807132184505, |
|
"eval_runtime": 0.1592, |
|
"eval_samples_per_second": 188.446, |
|
"eval_steps_per_second": 25.126, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13992537313432835, |
|
"grad_norm": 0.07560934871435165, |
|
"learning_rate": 3.888432835820896e-05, |
|
"loss": 0.0203, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13992537313432835, |
|
"eval_loss": 0.004563063848763704, |
|
"eval_runtime": 0.1618, |
|
"eval_samples_per_second": 185.371, |
|
"eval_steps_per_second": 24.716, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"grad_norm": 0.646364688873291, |
|
"learning_rate": 3.851119402985075e-05, |
|
"loss": 0.0144, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"eval_loss": 0.00161658669821918, |
|
"eval_runtime": 0.1567, |
|
"eval_samples_per_second": 191.49, |
|
"eval_steps_per_second": 25.532, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2332089552238806, |
|
"grad_norm": 0.30101820826530457, |
|
"learning_rate": 3.813805970149254e-05, |
|
"loss": 0.0079, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2332089552238806, |
|
"eval_loss": 0.0010551114100962877, |
|
"eval_runtime": 0.1559, |
|
"eval_samples_per_second": 192.377, |
|
"eval_steps_per_second": 25.65, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2798507462686567, |
|
"grad_norm": 0.5621537566184998, |
|
"learning_rate": 3.7764925373134334e-05, |
|
"loss": 0.0076, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2798507462686567, |
|
"eval_loss": 0.0002234852290712297, |
|
"eval_runtime": 0.1571, |
|
"eval_samples_per_second": 191.018, |
|
"eval_steps_per_second": 25.469, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.32649253731343286, |
|
"grad_norm": 0.10140960663557053, |
|
"learning_rate": 3.7391791044776125e-05, |
|
"loss": 0.0092, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32649253731343286, |
|
"eval_loss": 0.00010653692879714072, |
|
"eval_runtime": 0.1572, |
|
"eval_samples_per_second": 190.855, |
|
"eval_steps_per_second": 25.447, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 0.02672586776316166, |
|
"learning_rate": 3.701865671641792e-05, |
|
"loss": 0.0053, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"eval_loss": 0.00011367371189408004, |
|
"eval_runtime": 0.1569, |
|
"eval_samples_per_second": 191.155, |
|
"eval_steps_per_second": 25.487, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4197761194029851, |
|
"grad_norm": 0.09280095249414444, |
|
"learning_rate": 3.66455223880597e-05, |
|
"loss": 0.0054, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4197761194029851, |
|
"eval_loss": 0.00013640630641020834, |
|
"eval_runtime": 0.1588, |
|
"eval_samples_per_second": 188.886, |
|
"eval_steps_per_second": 25.185, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4664179104477612, |
|
"grad_norm": 0.034318458288908005, |
|
"learning_rate": 3.62723880597015e-05, |
|
"loss": 0.0067, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4664179104477612, |
|
"eval_loss": 7.585491403006017e-05, |
|
"eval_runtime": 0.1557, |
|
"eval_samples_per_second": 192.664, |
|
"eval_steps_per_second": 25.689, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5130597014925373, |
|
"grad_norm": 0.018935877829790115, |
|
"learning_rate": 3.5899253731343285e-05, |
|
"loss": 0.0035, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5130597014925373, |
|
"eval_loss": 4.489553612074815e-05, |
|
"eval_runtime": 0.1574, |
|
"eval_samples_per_second": 190.631, |
|
"eval_steps_per_second": 25.418, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"grad_norm": 0.8913596868515015, |
|
"learning_rate": 3.5526119402985077e-05, |
|
"loss": 0.0056, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"eval_loss": 4.118124343222007e-05, |
|
"eval_runtime": 0.1589, |
|
"eval_samples_per_second": 188.831, |
|
"eval_steps_per_second": 25.177, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6063432835820896, |
|
"grad_norm": 0.017305633053183556, |
|
"learning_rate": 3.515298507462687e-05, |
|
"loss": 0.0041, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6063432835820896, |
|
"eval_loss": 4.481070573092438e-05, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.329, |
|
"eval_steps_per_second": 25.777, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6529850746268657, |
|
"grad_norm": 0.009413465857505798, |
|
"learning_rate": 3.477985074626866e-05, |
|
"loss": 0.0029, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6529850746268657, |
|
"eval_loss": 2.9594228180940263e-05, |
|
"eval_runtime": 0.1654, |
|
"eval_samples_per_second": 181.422, |
|
"eval_steps_per_second": 24.19, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6996268656716418, |
|
"grad_norm": 0.004255557898432016, |
|
"learning_rate": 3.440671641791045e-05, |
|
"loss": 0.0028, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6996268656716418, |
|
"eval_loss": 3.3944801543839276e-05, |
|
"eval_runtime": 0.1543, |
|
"eval_samples_per_second": 194.435, |
|
"eval_steps_per_second": 25.925, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.6249216198921204, |
|
"learning_rate": 3.403358208955224e-05, |
|
"loss": 0.003, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"eval_loss": 3.595968155423179e-05, |
|
"eval_runtime": 0.1621, |
|
"eval_samples_per_second": 185.072, |
|
"eval_steps_per_second": 24.676, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.792910447761194, |
|
"grad_norm": 0.014389215037226677, |
|
"learning_rate": 3.3660447761194034e-05, |
|
"loss": 0.002, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.792910447761194, |
|
"eval_loss": 3.822403959929943e-05, |
|
"eval_runtime": 0.1593, |
|
"eval_samples_per_second": 188.284, |
|
"eval_steps_per_second": 25.105, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8395522388059702, |
|
"grad_norm": 0.1081342026591301, |
|
"learning_rate": 3.328731343283582e-05, |
|
"loss": 0.0039, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8395522388059702, |
|
"eval_loss": 3.7738984246971086e-05, |
|
"eval_runtime": 0.1558, |
|
"eval_samples_per_second": 192.583, |
|
"eval_steps_per_second": 25.678, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8861940298507462, |
|
"grad_norm": 1.123632550239563, |
|
"learning_rate": 3.291417910447762e-05, |
|
"loss": 0.0035, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8861940298507462, |
|
"eval_loss": 2.4694763851584867e-05, |
|
"eval_runtime": 0.157, |
|
"eval_samples_per_second": 191.065, |
|
"eval_steps_per_second": 25.475, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"grad_norm": 0.00756313418969512, |
|
"learning_rate": 3.25410447761194e-05, |
|
"loss": 0.0027, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"eval_loss": 5.045727448305115e-05, |
|
"eval_runtime": 0.1664, |
|
"eval_samples_per_second": 180.272, |
|
"eval_steps_per_second": 24.036, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9794776119402985, |
|
"grad_norm": 0.0035852426663041115, |
|
"learning_rate": 3.2167910447761194e-05, |
|
"loss": 0.0026, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9794776119402985, |
|
"eval_loss": 2.137782212230377e-05, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.332, |
|
"eval_steps_per_second": 25.778, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0261194029850746, |
|
"grad_norm": 0.01669035293161869, |
|
"learning_rate": 3.1794776119402986e-05, |
|
"loss": 0.0008, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0261194029850746, |
|
"eval_loss": 2.3261545720743015e-05, |
|
"eval_runtime": 0.1554, |
|
"eval_samples_per_second": 193.073, |
|
"eval_steps_per_second": 25.743, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0727611940298507, |
|
"grad_norm": 0.002784071722999215, |
|
"learning_rate": 3.142164179104478e-05, |
|
"loss": 0.0016, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0727611940298507, |
|
"eval_loss": 1.877884460554924e-05, |
|
"eval_runtime": 0.1597, |
|
"eval_samples_per_second": 187.795, |
|
"eval_steps_per_second": 25.039, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.010812056250870228, |
|
"learning_rate": 3.104850746268657e-05, |
|
"loss": 0.0013, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"eval_loss": 2.032700831477996e-05, |
|
"eval_runtime": 0.157, |
|
"eval_samples_per_second": 191.1, |
|
"eval_steps_per_second": 25.48, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.166044776119403, |
|
"grad_norm": 0.08882497251033783, |
|
"learning_rate": 3.067537313432836e-05, |
|
"loss": 0.0023, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.166044776119403, |
|
"eval_loss": 2.7997491997666657e-05, |
|
"eval_runtime": 0.1583, |
|
"eval_samples_per_second": 189.547, |
|
"eval_steps_per_second": 25.273, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.212686567164179, |
|
"grad_norm": 0.5638116002082825, |
|
"learning_rate": 3.030223880597015e-05, |
|
"loss": 0.0014, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.212686567164179, |
|
"eval_loss": 1.3365392987907398e-05, |
|
"eval_runtime": 0.1568, |
|
"eval_samples_per_second": 191.368, |
|
"eval_steps_per_second": 25.516, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2593283582089552, |
|
"grad_norm": 0.6982096433639526, |
|
"learning_rate": 2.9929104477611944e-05, |
|
"loss": 0.0009, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2593283582089552, |
|
"eval_loss": 1.1131261089758482e-05, |
|
"eval_runtime": 0.1571, |
|
"eval_samples_per_second": 191.001, |
|
"eval_steps_per_second": 25.467, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"grad_norm": 0.2126922756433487, |
|
"learning_rate": 2.9555970149253732e-05, |
|
"loss": 0.0007, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"eval_loss": 1.0051960998680443e-05, |
|
"eval_runtime": 0.1557, |
|
"eval_samples_per_second": 192.64, |
|
"eval_steps_per_second": 25.685, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3526119402985075, |
|
"grad_norm": 0.07693888247013092, |
|
"learning_rate": 2.9182835820895527e-05, |
|
"loss": 0.0011, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3526119402985075, |
|
"eval_loss": 8.506859558110591e-06, |
|
"eval_runtime": 0.1606, |
|
"eval_samples_per_second": 186.795, |
|
"eval_steps_per_second": 24.906, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3992537313432836, |
|
"grad_norm": 0.005555067211389542, |
|
"learning_rate": 2.8809701492537315e-05, |
|
"loss": 0.0012, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3992537313432836, |
|
"eval_loss": 1.0461569218023214e-05, |
|
"eval_runtime": 0.1562, |
|
"eval_samples_per_second": 192.09, |
|
"eval_steps_per_second": 25.612, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.4458955223880596, |
|
"grad_norm": 0.15883490443229675, |
|
"learning_rate": 2.8436567164179106e-05, |
|
"loss": 0.0015, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4458955223880596, |
|
"eval_loss": 9.19290141609963e-06, |
|
"eval_runtime": 0.1574, |
|
"eval_samples_per_second": 190.61, |
|
"eval_steps_per_second": 25.415, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.0036452861968427896, |
|
"learning_rate": 2.806716417910448e-05, |
|
"loss": 0.0017, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"eval_loss": 7.644109246029984e-06, |
|
"eval_runtime": 0.1573, |
|
"eval_samples_per_second": 190.724, |
|
"eval_steps_per_second": 25.43, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.539179104477612, |
|
"grad_norm": 0.1588873416185379, |
|
"learning_rate": 2.769402985074627e-05, |
|
"loss": 0.0009, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.539179104477612, |
|
"eval_loss": 9.155748557532206e-06, |
|
"eval_runtime": 0.1602, |
|
"eval_samples_per_second": 187.322, |
|
"eval_steps_per_second": 24.976, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.585820895522388, |
|
"grad_norm": 0.004792116116732359, |
|
"learning_rate": 2.7320895522388062e-05, |
|
"loss": 0.0018, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.585820895522388, |
|
"eval_loss": 8.091868039628025e-06, |
|
"eval_runtime": 0.1578, |
|
"eval_samples_per_second": 190.11, |
|
"eval_steps_per_second": 25.348, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.6324626865671643, |
|
"grad_norm": 2.761561870574951, |
|
"learning_rate": 2.694776119402985e-05, |
|
"loss": 0.0011, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6324626865671643, |
|
"eval_loss": 8.603728929301724e-06, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.265, |
|
"eval_steps_per_second": 25.769, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"grad_norm": 0.001969451317563653, |
|
"learning_rate": 2.6574626865671645e-05, |
|
"loss": 0.0012, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"eval_loss": 8.435232302872464e-06, |
|
"eval_runtime": 0.1553, |
|
"eval_samples_per_second": 193.184, |
|
"eval_steps_per_second": 25.758, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.7257462686567164, |
|
"grad_norm": 0.005350825376808643, |
|
"learning_rate": 2.6201492537313434e-05, |
|
"loss": 0.0008, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7257462686567164, |
|
"eval_loss": 7.432493021042319e-06, |
|
"eval_runtime": 0.1563, |
|
"eval_samples_per_second": 191.964, |
|
"eval_steps_per_second": 25.595, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7723880597014925, |
|
"grad_norm": 0.004110433626919985, |
|
"learning_rate": 2.5828358208955225e-05, |
|
"loss": 0.0014, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.7723880597014925, |
|
"eval_loss": 7.303169240913121e-06, |
|
"eval_runtime": 0.1569, |
|
"eval_samples_per_second": 191.177, |
|
"eval_steps_per_second": 25.49, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.8190298507462686, |
|
"grad_norm": 0.0039463951252400875, |
|
"learning_rate": 2.5455223880597017e-05, |
|
"loss": 0.0015, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8190298507462686, |
|
"eval_loss": 9.76303817878943e-06, |
|
"eval_runtime": 0.1571, |
|
"eval_samples_per_second": 190.973, |
|
"eval_steps_per_second": 25.463, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"grad_norm": 0.004805909004062414, |
|
"learning_rate": 2.508208955223881e-05, |
|
"loss": 0.0012, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"eval_loss": 8.14036320662126e-06, |
|
"eval_runtime": 0.157, |
|
"eval_samples_per_second": 191.027, |
|
"eval_steps_per_second": 25.47, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.912313432835821, |
|
"grad_norm": 0.0031389566138386726, |
|
"learning_rate": 2.4708955223880597e-05, |
|
"loss": 0.0004, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.912313432835821, |
|
"eval_loss": 6.821030183346011e-06, |
|
"eval_runtime": 0.1614, |
|
"eval_samples_per_second": 185.9, |
|
"eval_steps_per_second": 24.787, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9589552238805972, |
|
"grad_norm": 0.0017104181461036205, |
|
"learning_rate": 2.433582089552239e-05, |
|
"loss": 0.0008, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9589552238805972, |
|
"eval_loss": 7.834916686988436e-06, |
|
"eval_runtime": 0.1615, |
|
"eval_samples_per_second": 185.757, |
|
"eval_steps_per_second": 24.768, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.0055970149253732, |
|
"grad_norm": 0.013449718244373798, |
|
"learning_rate": 2.396268656716418e-05, |
|
"loss": 0.001, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0055970149253732, |
|
"eval_loss": 7.582441867270973e-06, |
|
"eval_runtime": 0.1686, |
|
"eval_samples_per_second": 177.913, |
|
"eval_steps_per_second": 23.722, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0522388059701493, |
|
"grad_norm": 0.0017907052533701062, |
|
"learning_rate": 2.358955223880597e-05, |
|
"loss": 0.0007, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0522388059701493, |
|
"eval_loss": 6.949214366613887e-06, |
|
"eval_runtime": 0.1537, |
|
"eval_samples_per_second": 195.234, |
|
"eval_steps_per_second": 26.031, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0988805970149254, |
|
"grad_norm": 0.001830141874961555, |
|
"learning_rate": 2.3216417910447763e-05, |
|
"loss": 0.0004, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.0988805970149254, |
|
"eval_loss": 7.074564564391039e-06, |
|
"eval_runtime": 0.1577, |
|
"eval_samples_per_second": 190.227, |
|
"eval_steps_per_second": 25.364, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.1455223880597014, |
|
"grad_norm": 0.0030846677254885435, |
|
"learning_rate": 2.2843283582089555e-05, |
|
"loss": 0.0006, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.1455223880597014, |
|
"eval_loss": 6.291250883805333e-06, |
|
"eval_runtime": 0.1566, |
|
"eval_samples_per_second": 191.563, |
|
"eval_steps_per_second": 25.542, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.1921641791044775, |
|
"grad_norm": 0.24780245125293732, |
|
"learning_rate": 2.2470149253731343e-05, |
|
"loss": 0.0003, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.1921641791044775, |
|
"eval_loss": 6.1347077462414745e-06, |
|
"eval_runtime": 0.1575, |
|
"eval_samples_per_second": 190.525, |
|
"eval_steps_per_second": 25.403, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.2388059701492535, |
|
"grad_norm": 0.006296923384070396, |
|
"learning_rate": 2.2097014925373138e-05, |
|
"loss": 0.0003, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.2388059701492535, |
|
"eval_loss": 5.517637873708736e-06, |
|
"eval_runtime": 0.1579, |
|
"eval_samples_per_second": 189.97, |
|
"eval_steps_per_second": 25.329, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.28544776119403, |
|
"grad_norm": 0.0017776070162653923, |
|
"learning_rate": 2.1723880597014926e-05, |
|
"loss": 0.0007, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.28544776119403, |
|
"eval_loss": 5.534998763323529e-06, |
|
"eval_runtime": 0.158, |
|
"eval_samples_per_second": 189.867, |
|
"eval_steps_per_second": 25.316, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.332089552238806, |
|
"grad_norm": 0.0026376782916486263, |
|
"learning_rate": 2.1350746268656717e-05, |
|
"loss": 0.0004, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.332089552238806, |
|
"eval_loss": 5.3646053856937215e-06, |
|
"eval_runtime": 0.158, |
|
"eval_samples_per_second": 189.825, |
|
"eval_steps_per_second": 25.31, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.378731343283582, |
|
"grad_norm": 0.008729691617190838, |
|
"learning_rate": 2.097761194029851e-05, |
|
"loss": 0.0009, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.378731343283582, |
|
"eval_loss": 5.699926987290382e-06, |
|
"eval_runtime": 0.1586, |
|
"eval_samples_per_second": 189.183, |
|
"eval_steps_per_second": 25.224, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.425373134328358, |
|
"grad_norm": 0.0009663441451266408, |
|
"learning_rate": 2.06044776119403e-05, |
|
"loss": 0.0002, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.425373134328358, |
|
"eval_loss": 5.018339379603276e-06, |
|
"eval_runtime": 0.1609, |
|
"eval_samples_per_second": 186.434, |
|
"eval_steps_per_second": 24.858, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.4720149253731343, |
|
"grad_norm": 0.0009370720363222063, |
|
"learning_rate": 2.023134328358209e-05, |
|
"loss": 0.0003, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.4720149253731343, |
|
"eval_loss": 4.319118488638196e-06, |
|
"eval_runtime": 0.1594, |
|
"eval_samples_per_second": 188.258, |
|
"eval_steps_per_second": 25.101, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.5186567164179103, |
|
"grad_norm": 0.00313656241632998, |
|
"learning_rate": 1.9858208955223884e-05, |
|
"loss": 0.0008, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5186567164179103, |
|
"eval_loss": 4.456574970390648e-06, |
|
"eval_runtime": 0.1551, |
|
"eval_samples_per_second": 193.476, |
|
"eval_steps_per_second": 25.797, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5652985074626864, |
|
"grad_norm": 0.0011074623325839639, |
|
"learning_rate": 1.9485074626865675e-05, |
|
"loss": 0.001, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.5652985074626864, |
|
"eval_loss": 3.924767952412367e-06, |
|
"eval_runtime": 0.1544, |
|
"eval_samples_per_second": 194.285, |
|
"eval_steps_per_second": 25.905, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"grad_norm": 0.02510717324912548, |
|
"learning_rate": 1.9111940298507467e-05, |
|
"loss": 0.0003, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.611940298507463, |
|
"eval_loss": 4.046037247462664e-06, |
|
"eval_runtime": 0.1553, |
|
"eval_samples_per_second": 193.136, |
|
"eval_steps_per_second": 25.752, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.658582089552239, |
|
"grad_norm": 0.023706955835223198, |
|
"learning_rate": 1.8738805970149255e-05, |
|
"loss": 0.0003, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.658582089552239, |
|
"eval_loss": 4.111709131393582e-06, |
|
"eval_runtime": 0.1558, |
|
"eval_samples_per_second": 192.543, |
|
"eval_steps_per_second": 25.672, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.705223880597015, |
|
"grad_norm": 0.004163182340562344, |
|
"learning_rate": 1.8365671641791047e-05, |
|
"loss": 0.0006, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.705223880597015, |
|
"eval_loss": 4.339735369285336e-06, |
|
"eval_runtime": 0.1607, |
|
"eval_samples_per_second": 186.695, |
|
"eval_steps_per_second": 24.893, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.751865671641791, |
|
"grad_norm": 0.002963052364066243, |
|
"learning_rate": 1.7992537313432835e-05, |
|
"loss": 0.0005, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.751865671641791, |
|
"eval_loss": 3.9786968955013435e-06, |
|
"eval_runtime": 0.1575, |
|
"eval_samples_per_second": 190.477, |
|
"eval_steps_per_second": 25.397, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.798507462686567, |
|
"grad_norm": 0.045626550912857056, |
|
"learning_rate": 1.7619402985074627e-05, |
|
"loss": 0.0008, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.798507462686567, |
|
"eval_loss": 4.052158146805596e-06, |
|
"eval_runtime": 0.1579, |
|
"eval_samples_per_second": 189.965, |
|
"eval_steps_per_second": 25.329, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.845149253731343, |
|
"grad_norm": 0.008129669353365898, |
|
"learning_rate": 1.7246268656716418e-05, |
|
"loss": 0.0004, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.845149253731343, |
|
"eval_loss": 4.024254849355202e-06, |
|
"eval_runtime": 0.158, |
|
"eval_samples_per_second": 189.912, |
|
"eval_steps_per_second": 25.322, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.8917910447761193, |
|
"grad_norm": 0.002264247043058276, |
|
"learning_rate": 1.687313432835821e-05, |
|
"loss": 0.0007, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.8917910447761193, |
|
"eval_loss": 3.7680488276237156e-06, |
|
"eval_runtime": 0.1559, |
|
"eval_samples_per_second": 192.455, |
|
"eval_steps_per_second": 25.661, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.9384328358208958, |
|
"grad_norm": 0.013498159125447273, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.0002, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.9384328358208958, |
|
"eval_loss": 3.534241386660142e-06, |
|
"eval_runtime": 0.1562, |
|
"eval_samples_per_second": 192.066, |
|
"eval_steps_per_second": 25.609, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"grad_norm": 0.000992327812127769, |
|
"learning_rate": 1.6126865671641793e-05, |
|
"loss": 0.0009, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"eval_loss": 3.539219051162945e-06, |
|
"eval_runtime": 0.1581, |
|
"eval_samples_per_second": 189.704, |
|
"eval_steps_per_second": 25.294, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.031716417910448, |
|
"grad_norm": 0.002681414596736431, |
|
"learning_rate": 1.575373134328358e-05, |
|
"loss": 0.0003, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.031716417910448, |
|
"eval_loss": 3.239377292629797e-06, |
|
"eval_runtime": 0.1572, |
|
"eval_samples_per_second": 190.818, |
|
"eval_steps_per_second": 25.442, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.078358208955224, |
|
"grad_norm": 0.06693683564662933, |
|
"learning_rate": 1.5380597014925373e-05, |
|
"loss": 0.0003, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.078358208955224, |
|
"eval_loss": 3.1195118026516866e-06, |
|
"eval_runtime": 0.1554, |
|
"eval_samples_per_second": 192.996, |
|
"eval_steps_per_second": 25.733, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.007976911030709743, |
|
"learning_rate": 1.5007462686567164e-05, |
|
"loss": 0.0002, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 3.158217623422388e-06, |
|
"eval_runtime": 0.1613, |
|
"eval_samples_per_second": 186.004, |
|
"eval_steps_per_second": 24.801, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.171641791044776, |
|
"grad_norm": 0.0018902523443102837, |
|
"learning_rate": 1.4634328358208956e-05, |
|
"loss": 0.0002, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.171641791044776, |
|
"eval_loss": 3.020254553121049e-06, |
|
"eval_runtime": 0.1619, |
|
"eval_samples_per_second": 185.322, |
|
"eval_steps_per_second": 24.71, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.218283582089552, |
|
"grad_norm": 0.0031356920953840017, |
|
"learning_rate": 1.4261194029850747e-05, |
|
"loss": 0.0002, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.218283582089552, |
|
"eval_loss": 2.9624054604937555e-06, |
|
"eval_runtime": 0.1556, |
|
"eval_samples_per_second": 192.812, |
|
"eval_steps_per_second": 25.708, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.264925373134328, |
|
"grad_norm": 0.005576052237302065, |
|
"learning_rate": 1.3888059701492537e-05, |
|
"loss": 0.0005, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.264925373134328, |
|
"eval_loss": 2.9549014470831025e-06, |
|
"eval_runtime": 0.1578, |
|
"eval_samples_per_second": 190.15, |
|
"eval_steps_per_second": 25.353, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.3115671641791042, |
|
"grad_norm": 0.0018602815689519048, |
|
"learning_rate": 1.3514925373134329e-05, |
|
"loss": 0.0002, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3115671641791042, |
|
"eval_loss": 2.8336680770735256e-06, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.246, |
|
"eval_steps_per_second": 25.766, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"grad_norm": 0.002030579838901758, |
|
"learning_rate": 1.314179104477612e-05, |
|
"loss": 0.0003, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.3582089552238807, |
|
"eval_loss": 2.7239029805059545e-06, |
|
"eval_runtime": 0.1624, |
|
"eval_samples_per_second": 184.688, |
|
"eval_steps_per_second": 24.625, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.404850746268657, |
|
"grad_norm": 0.027261830866336823, |
|
"learning_rate": 1.276865671641791e-05, |
|
"loss": 0.0003, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.404850746268657, |
|
"eval_loss": 2.706149643927347e-06, |
|
"eval_runtime": 0.1586, |
|
"eval_samples_per_second": 189.213, |
|
"eval_steps_per_second": 25.228, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.451492537313433, |
|
"grad_norm": 0.005287382751703262, |
|
"learning_rate": 1.2395522388059702e-05, |
|
"loss": 0.0002, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.451492537313433, |
|
"eval_loss": 2.5531167011649814e-06, |
|
"eval_runtime": 0.1543, |
|
"eval_samples_per_second": 194.469, |
|
"eval_steps_per_second": 25.929, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.498134328358209, |
|
"grad_norm": 0.0021771446336060762, |
|
"learning_rate": 1.2022388059701493e-05, |
|
"loss": 0.0003, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.498134328358209, |
|
"eval_loss": 2.8221681986906333e-06, |
|
"eval_runtime": 0.1601, |
|
"eval_samples_per_second": 187.36, |
|
"eval_steps_per_second": 24.981, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.544776119402985, |
|
"grad_norm": 0.0027294817846268415, |
|
"learning_rate": 1.1649253731343283e-05, |
|
"loss": 0.0003, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.544776119402985, |
|
"eval_loss": 2.784717707982054e-06, |
|
"eval_runtime": 0.1555, |
|
"eval_samples_per_second": 192.876, |
|
"eval_steps_per_second": 25.717, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.591417910447761, |
|
"grad_norm": 0.0013641875702887774, |
|
"learning_rate": 1.1276119402985075e-05, |
|
"loss": 0.0003, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.591417910447761, |
|
"eval_loss": 2.9222812827356393e-06, |
|
"eval_runtime": 0.1639, |
|
"eval_samples_per_second": 183.034, |
|
"eval_steps_per_second": 24.405, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.638059701492537, |
|
"grad_norm": 0.08378789573907852, |
|
"learning_rate": 1.0902985074626867e-05, |
|
"loss": 0.0002, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.638059701492537, |
|
"eval_loss": 2.9144475774955936e-06, |
|
"eval_runtime": 0.1585, |
|
"eval_samples_per_second": 189.266, |
|
"eval_steps_per_second": 25.235, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.6847014925373136, |
|
"grad_norm": 0.004190696403384209, |
|
"learning_rate": 1.0529850746268656e-05, |
|
"loss": 0.0002, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.6847014925373136, |
|
"eval_loss": 2.752276259343489e-06, |
|
"eval_runtime": 0.1641, |
|
"eval_samples_per_second": 182.838, |
|
"eval_steps_per_second": 24.378, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"grad_norm": 0.0028494582511484623, |
|
"learning_rate": 1.0156716417910448e-05, |
|
"loss": 0.0005, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.7313432835820897, |
|
"eval_loss": 2.7154821964359144e-06, |
|
"eval_runtime": 0.1578, |
|
"eval_samples_per_second": 190.11, |
|
"eval_steps_per_second": 25.348, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.7779850746268657, |
|
"grad_norm": 0.03807828575372696, |
|
"learning_rate": 9.787313432835822e-06, |
|
"loss": 0.0006, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.7779850746268657, |
|
"eval_loss": 2.827802290994441e-06, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.25, |
|
"eval_steps_per_second": 25.767, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.824626865671642, |
|
"grad_norm": 0.5222908854484558, |
|
"learning_rate": 9.414179104477614e-06, |
|
"loss": 0.0002, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.824626865671642, |
|
"eval_loss": 2.7299265639157966e-06, |
|
"eval_runtime": 0.156, |
|
"eval_samples_per_second": 192.29, |
|
"eval_steps_per_second": 25.639, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.871268656716418, |
|
"grad_norm": 0.0023298938758671284, |
|
"learning_rate": 9.041044776119404e-06, |
|
"loss": 0.0002, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.871268656716418, |
|
"eval_loss": 2.6636719212547177e-06, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.302, |
|
"eval_steps_per_second": 25.774, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.917910447761194, |
|
"grad_norm": 0.05465957522392273, |
|
"learning_rate": 8.667910447761195e-06, |
|
"loss": 0.0005, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.917910447761194, |
|
"eval_loss": 2.582373781478964e-06, |
|
"eval_runtime": 0.1571, |
|
"eval_samples_per_second": 190.991, |
|
"eval_steps_per_second": 25.465, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.96455223880597, |
|
"grad_norm": 0.048116568475961685, |
|
"learning_rate": 8.294776119402985e-06, |
|
"loss": 0.0002, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.96455223880597, |
|
"eval_loss": 2.487162873876514e-06, |
|
"eval_runtime": 0.1645, |
|
"eval_samples_per_second": 182.375, |
|
"eval_steps_per_second": 24.317, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.0111940298507465, |
|
"grad_norm": 0.0024153583217412233, |
|
"learning_rate": 7.921641791044777e-06, |
|
"loss": 0.0001, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.0111940298507465, |
|
"eval_loss": 2.429806954751257e-06, |
|
"eval_runtime": 0.182, |
|
"eval_samples_per_second": 164.853, |
|
"eval_steps_per_second": 21.98, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.057835820895522, |
|
"grad_norm": 0.002540579764172435, |
|
"learning_rate": 7.548507462686568e-06, |
|
"loss": 0.0001, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 4.057835820895522, |
|
"eval_loss": 2.3769621293467935e-06, |
|
"eval_runtime": 0.1595, |
|
"eval_samples_per_second": 188.114, |
|
"eval_steps_per_second": 25.082, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 4.104477611940299, |
|
"grad_norm": 0.0042966934852302074, |
|
"learning_rate": 7.175373134328358e-06, |
|
"loss": 0.0002, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.104477611940299, |
|
"eval_loss": 2.33703940466512e-06, |
|
"eval_runtime": 0.1575, |
|
"eval_samples_per_second": 190.469, |
|
"eval_steps_per_second": 25.396, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.151119402985074, |
|
"grad_norm": 0.0006062475731596351, |
|
"learning_rate": 6.802238805970149e-06, |
|
"loss": 0.0004, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.151119402985074, |
|
"eval_loss": 2.3632851480215322e-06, |
|
"eval_runtime": 0.1561, |
|
"eval_samples_per_second": 192.228, |
|
"eval_steps_per_second": 25.63, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.197761194029851, |
|
"grad_norm": 0.009168209508061409, |
|
"learning_rate": 6.429104477611941e-06, |
|
"loss": 0.0002, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.197761194029851, |
|
"eval_loss": 2.2578756215807516e-06, |
|
"eval_runtime": 0.1552, |
|
"eval_samples_per_second": 193.262, |
|
"eval_steps_per_second": 25.768, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.244402985074627, |
|
"grad_norm": 0.002683964092284441, |
|
"learning_rate": 6.055970149253731e-06, |
|
"loss": 0.0001, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.244402985074627, |
|
"eval_loss": 2.2409305984183447e-06, |
|
"eval_runtime": 0.1564, |
|
"eval_samples_per_second": 191.8, |
|
"eval_steps_per_second": 25.573, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.291044776119403, |
|
"grad_norm": 0.0012024191673845053, |
|
"learning_rate": 5.682835820895522e-06, |
|
"loss": 0.0002, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.291044776119403, |
|
"eval_loss": 2.202784344262909e-06, |
|
"eval_runtime": 0.1563, |
|
"eval_samples_per_second": 191.927, |
|
"eval_steps_per_second": 25.59, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.337686567164179, |
|
"grad_norm": 0.003109121695160866, |
|
"learning_rate": 5.309701492537314e-06, |
|
"loss": 0.0002, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.337686567164179, |
|
"eval_loss": 2.2312042347039096e-06, |
|
"eval_runtime": 0.1563, |
|
"eval_samples_per_second": 191.941, |
|
"eval_steps_per_second": 25.592, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.384328358208955, |
|
"grad_norm": 0.0009185061790049076, |
|
"learning_rate": 4.9365671641791045e-06, |
|
"loss": 0.0002, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.384328358208955, |
|
"eval_loss": 2.1759601622761693e-06, |
|
"eval_runtime": 0.159, |
|
"eval_samples_per_second": 188.695, |
|
"eval_steps_per_second": 25.159, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.4309701492537314, |
|
"grad_norm": 0.0008011642494238913, |
|
"learning_rate": 4.563432835820896e-06, |
|
"loss": 0.0004, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.4309701492537314, |
|
"eval_loss": 2.2056660782254767e-06, |
|
"eval_runtime": 0.1567, |
|
"eval_samples_per_second": 191.395, |
|
"eval_steps_per_second": 25.519, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"grad_norm": 0.001676097046583891, |
|
"learning_rate": 4.190298507462687e-06, |
|
"loss": 0.0001, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"eval_loss": 2.196997229475528e-06, |
|
"eval_runtime": 0.1554, |
|
"eval_samples_per_second": 193.034, |
|
"eval_steps_per_second": 25.738, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.524253731343284, |
|
"grad_norm": 0.001348722493276, |
|
"learning_rate": 3.8171641791044775e-06, |
|
"loss": 0.0002, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.524253731343284, |
|
"eval_loss": 2.159943960577948e-06, |
|
"eval_runtime": 0.1588, |
|
"eval_samples_per_second": 188.88, |
|
"eval_steps_per_second": 25.184, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.57089552238806, |
|
"grad_norm": 0.002949994755908847, |
|
"learning_rate": 3.4440298507462687e-06, |
|
"loss": 0.0001, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.57089552238806, |
|
"eval_loss": 2.117126769007882e-06, |
|
"eval_runtime": 0.1558, |
|
"eval_samples_per_second": 192.602, |
|
"eval_steps_per_second": 25.68, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.617537313432836, |
|
"grad_norm": 0.005024689715355635, |
|
"learning_rate": 3.07089552238806e-06, |
|
"loss": 0.0001, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.617537313432836, |
|
"eval_loss": 2.069621814371203e-06, |
|
"eval_runtime": 0.1558, |
|
"eval_samples_per_second": 192.576, |
|
"eval_steps_per_second": 25.677, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.664179104477612, |
|
"grad_norm": 0.0006656211335211992, |
|
"learning_rate": 2.697761194029851e-06, |
|
"loss": 0.0004, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.664179104477612, |
|
"eval_loss": 2.0410807337611914e-06, |
|
"eval_runtime": 0.1585, |
|
"eval_samples_per_second": 189.231, |
|
"eval_steps_per_second": 25.231, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.710820895522388, |
|
"grad_norm": 0.0003097167646046728, |
|
"learning_rate": 2.324626865671642e-06, |
|
"loss": 0.0001, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.710820895522388, |
|
"eval_loss": 2.0217082692397526e-06, |
|
"eval_runtime": 0.1557, |
|
"eval_samples_per_second": 192.62, |
|
"eval_steps_per_second": 25.683, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.757462686567164, |
|
"grad_norm": 0.0007842735503800213, |
|
"learning_rate": 1.951492537313433e-06, |
|
"loss": 0.0001, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.757462686567164, |
|
"eval_loss": 2.0089323697902728e-06, |
|
"eval_runtime": 0.174, |
|
"eval_samples_per_second": 172.426, |
|
"eval_steps_per_second": 22.99, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.80410447761194, |
|
"grad_norm": 0.004456710536032915, |
|
"learning_rate": 1.578358208955224e-06, |
|
"loss": 0.0001, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.80410447761194, |
|
"eval_loss": 2.0024342575197807e-06, |
|
"eval_runtime": 0.1647, |
|
"eval_samples_per_second": 182.154, |
|
"eval_steps_per_second": 24.287, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.850746268656716, |
|
"grad_norm": 0.0003174096636939794, |
|
"learning_rate": 1.205223880597015e-06, |
|
"loss": 0.0001, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.850746268656716, |
|
"eval_loss": 1.9801934740826255e-06, |
|
"eval_runtime": 0.1571, |
|
"eval_samples_per_second": 190.953, |
|
"eval_steps_per_second": 25.46, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.897388059701493, |
|
"grad_norm": 0.013769041746854782, |
|
"learning_rate": 8.320895522388061e-07, |
|
"loss": 0.0001, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.897388059701493, |
|
"eval_loss": 1.9813883227470797e-06, |
|
"eval_runtime": 0.1598, |
|
"eval_samples_per_second": 187.736, |
|
"eval_steps_per_second": 25.031, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.9440298507462686, |
|
"grad_norm": 0.0012828693725168705, |
|
"learning_rate": 4.5895522388059706e-07, |
|
"loss": 0.0002, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.9440298507462686, |
|
"eval_loss": 1.981977902687504e-06, |
|
"eval_runtime": 0.1586, |
|
"eval_samples_per_second": 189.182, |
|
"eval_steps_per_second": 25.224, |
|
"step": 10600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10720, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4161551619317760.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|