|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 38.0, |
|
"eval_steps": 500, |
|
"global_step": 20102, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.5107442140579224, |
|
"learning_rate": 5e-05, |
|
"loss": 2.0482, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.5563424229621887, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8835, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.4735371172428131, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8512, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.4543495178222656, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8489, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.5007112622261047, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8439, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5733333333333334, |
|
"eval_loss": 1.6194567680358887, |
|
"eval_runtime": 5.4154, |
|
"eval_samples_per_second": 92.33, |
|
"eval_steps_per_second": 11.634, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_exact_match": 17.0, |
|
"eval_f1": 26.880952380952387, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.5181360244750977, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8217, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 1.1348918676376343, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7866, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.661758303642273, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7934, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.5634772777557373, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7731, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 0.6698930263519287, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7969, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5751794871794872, |
|
"eval_loss": 1.603736162185669, |
|
"eval_runtime": 5.0986, |
|
"eval_samples_per_second": 98.066, |
|
"eval_steps_per_second": 12.356, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 27.218095238095238, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.6265193223953247, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7416, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 0.812364935874939, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6957, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 1.0972926616668701, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7033, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 1.0831658840179443, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6935, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 1.0022494792938232, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6799, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.573025641025641, |
|
"eval_loss": 1.6428505182266235, |
|
"eval_runtime": 5.1488, |
|
"eval_samples_per_second": 97.111, |
|
"eval_steps_per_second": 12.236, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_exact_match": 20.2, |
|
"eval_f1": 27.93809523809523, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 0.8240587711334229, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6618, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 1.2994645833969116, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5736, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 1.0891159772872925, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5798, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 1.3826475143432617, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5957, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 0.9901586174964905, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6026, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 1.0607813596725464, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6106, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5708717948717948, |
|
"eval_loss": 1.7143596410751343, |
|
"eval_runtime": 5.3942, |
|
"eval_samples_per_second": 92.693, |
|
"eval_steps_per_second": 11.679, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_exact_match": 19.0, |
|
"eval_f1": 27.71428571428571, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 1.3189178705215454, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5029, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 1.062267541885376, |
|
"learning_rate": 5e-05, |
|
"loss": 1.51, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 1.3810843229293823, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4867, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 1.1991653442382812, |
|
"learning_rate": 5e-05, |
|
"loss": 1.501, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 1.310654878616333, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5034, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5681025641025641, |
|
"eval_loss": 1.8328346014022827, |
|
"eval_runtime": 5.7763, |
|
"eval_samples_per_second": 86.561, |
|
"eval_steps_per_second": 10.907, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_exact_match": 18.8, |
|
"eval_f1": 26.6015873015873, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 1.1406631469726562, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4605, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 1.447221279144287, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4204, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 1.461799144744873, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4233, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 1.4184092283248901, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4265, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 0.9850455522537231, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4139, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.564974358974359, |
|
"eval_loss": 1.931396484375, |
|
"eval_runtime": 5.3358, |
|
"eval_samples_per_second": 93.706, |
|
"eval_steps_per_second": 11.807, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 25.464285714285715, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 1.2274116277694702, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4159, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 1.2107988595962524, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3579, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 1.2965952157974243, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3485, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 1.3035151958465576, |
|
"learning_rate": 5e-05, |
|
"loss": 1.36, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 1.1840296983718872, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3599, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 1.780287265777588, |
|
"learning_rate": 5e-05, |
|
"loss": 1.355, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.565076923076923, |
|
"eval_loss": 1.9659509658813477, |
|
"eval_runtime": 5.3127, |
|
"eval_samples_per_second": 94.114, |
|
"eval_steps_per_second": 11.858, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_exact_match": 18.0, |
|
"eval_f1": 26.546666666666663, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 1.1608421802520752, |
|
"learning_rate": 5e-05, |
|
"loss": 1.298, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 1.083270788192749, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3088, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 1.3489899635314941, |
|
"learning_rate": 5e-05, |
|
"loss": 1.299, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 1.1663447618484497, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2841, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 1.9308583736419678, |
|
"learning_rate": 5e-05, |
|
"loss": 1.314, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5616923076923077, |
|
"eval_loss": 2.1164443492889404, |
|
"eval_runtime": 5.2165, |
|
"eval_samples_per_second": 95.85, |
|
"eval_steps_per_second": 12.077, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_exact_match": 17.0, |
|
"eval_f1": 24.659841269841277, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 1.109632134437561, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2558, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 1.1132491827011108, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2264, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 1.1017369031906128, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2649, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 1.0874963998794556, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2512, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 1.2617064714431763, |
|
"learning_rate": 5e-05, |
|
"loss": 1.261, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5622051282051282, |
|
"eval_loss": 2.1635327339172363, |
|
"eval_runtime": 5.0832, |
|
"eval_samples_per_second": 98.363, |
|
"eval_steps_per_second": 12.394, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_exact_match": 17.8, |
|
"eval_f1": 25.451428571428576, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 1.2258027791976929, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2298, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 1.8261148929595947, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1748, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 1.1221072673797607, |
|
"learning_rate": 5e-05, |
|
"loss": 1.19, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 1.3320170640945435, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1932, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 1.194077730178833, |
|
"learning_rate": 5e-05, |
|
"loss": 1.217, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5612820512820513, |
|
"eval_loss": 2.244041919708252, |
|
"eval_runtime": 5.1606, |
|
"eval_samples_per_second": 96.889, |
|
"eval_steps_per_second": 12.208, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_exact_match": 18.4, |
|
"eval_f1": 26.067662337662348, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 1.1352814435958862, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2093, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 1.2741838693618774, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1467, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 1.6901249885559082, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1288, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 1.1944465637207031, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1399, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 1.2234572172164917, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1678, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 1.2781612873077393, |
|
"learning_rate": 5e-05, |
|
"loss": 1.146, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.5618461538461539, |
|
"eval_loss": 2.296630859375, |
|
"eval_runtime": 5.1078, |
|
"eval_samples_per_second": 97.889, |
|
"eval_steps_per_second": 12.334, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 23.63305916305917, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 1.138975739479065, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0842, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 1.727989912033081, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0806, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 1.3104358911514282, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0953, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 1.3629850149154663, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0945, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 1.3276056051254272, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1069, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.562, |
|
"eval_loss": 2.3118155002593994, |
|
"eval_runtime": 5.3837, |
|
"eval_samples_per_second": 92.873, |
|
"eval_steps_per_second": 11.702, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 24.43924963924964, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 1.2608078718185425, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0418, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 1.4456945657730103, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0258, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 1.4195287227630615, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0255, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 1.2595598697662354, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0511, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 1.5901292562484741, |
|
"learning_rate": 5e-05, |
|
"loss": 1.045, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.5614871794871795, |
|
"eval_loss": 2.3580987453460693, |
|
"eval_runtime": 5.3496, |
|
"eval_samples_per_second": 93.466, |
|
"eval_steps_per_second": 11.777, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_exact_match": 17.2, |
|
"eval_f1": 24.350995670995676, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 1.471511721611023, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0404, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 1.2928757667541504, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9573, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 1.5545622110366821, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9583, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 1.6132512092590332, |
|
"learning_rate": 5e-05, |
|
"loss": 0.981, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 1.7548742294311523, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9999, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 1.4219907522201538, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0028, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5618974358974359, |
|
"eval_loss": 2.413728952407837, |
|
"eval_runtime": 5.0464, |
|
"eval_samples_per_second": 99.08, |
|
"eval_steps_per_second": 12.484, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_exact_match": 16.6, |
|
"eval_f1": 24.201789321789327, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 1.5799965858459473, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9096, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 1.5961594581604004, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9075, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 1.5756512880325317, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9222, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 1.515251636505127, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9445, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 2.6804299354553223, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9153, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5596923076923077, |
|
"eval_loss": 2.4756176471710205, |
|
"eval_runtime": 5.1956, |
|
"eval_samples_per_second": 96.236, |
|
"eval_steps_per_second": 12.126, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 24.146709956709966, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 1.7786214351654053, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8874, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 1.709060788154602, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8538, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 2.3245434761047363, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8591, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 1.6101887226104736, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8593, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 1.5917792320251465, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8748, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5604615384615385, |
|
"eval_loss": 2.4434447288513184, |
|
"eval_runtime": 5.1072, |
|
"eval_samples_per_second": 97.901, |
|
"eval_steps_per_second": 12.336, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_exact_match": 15.6, |
|
"eval_f1": 22.917027417027416, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 1.6529980897903442, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8413, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 1.931520938873291, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7863, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 1.8401894569396973, |
|
"learning_rate": 5e-05, |
|
"loss": 0.799, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 1.7952015399932861, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8238, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 1.8028634786605835, |
|
"learning_rate": 5e-05, |
|
"loss": 0.807, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5606153846153846, |
|
"eval_loss": 2.4520959854125977, |
|
"eval_runtime": 5.1769, |
|
"eval_samples_per_second": 96.584, |
|
"eval_steps_per_second": 12.17, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 25.104011544011545, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.013232514177695, |
|
"grad_norm": 1.7197688817977905, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8249, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.20226843100189, |
|
"grad_norm": 1.768463134765625, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7304, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 1.9696024656295776, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7445, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 17.58034026465028, |
|
"grad_norm": 2.093703269958496, |
|
"learning_rate": 5e-05, |
|
"loss": 0.755, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 17.76937618147448, |
|
"grad_norm": 1.909550666809082, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7698, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 17.958412098298677, |
|
"grad_norm": 1.6835806369781494, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7711, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.5603589743589743, |
|
"eval_loss": 2.473491668701172, |
|
"eval_runtime": 5.4257, |
|
"eval_samples_per_second": 92.155, |
|
"eval_steps_per_second": 11.611, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_exact_match": 17.0, |
|
"eval_f1": 24.074487734487732, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.147448015122873, |
|
"grad_norm": 2.0483736991882324, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7048, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 18.33648393194707, |
|
"grad_norm": 1.987457275390625, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6863, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 18.525519848771268, |
|
"grad_norm": 2.1744563579559326, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7045, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 18.714555765595463, |
|
"grad_norm": 1.9871633052825928, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7076, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 18.90359168241966, |
|
"grad_norm": 1.8895001411437988, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7202, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.5601538461538461, |
|
"eval_loss": 2.5168700218200684, |
|
"eval_runtime": 5.0434, |
|
"eval_samples_per_second": 99.14, |
|
"eval_steps_per_second": 12.492, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_exact_match": 17.2, |
|
"eval_f1": 24.260360750360757, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.092627599243855, |
|
"grad_norm": 1.7914209365844727, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6728, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 19.281663516068054, |
|
"grad_norm": 2.198495388031006, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6241, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 19.47069943289225, |
|
"grad_norm": 1.81365966796875, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6558, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 19.659735349716446, |
|
"grad_norm": 1.6571800708770752, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6652, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 19.84877126654064, |
|
"grad_norm": 1.6469954252243042, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6637, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.5596410256410257, |
|
"eval_loss": 2.516073703765869, |
|
"eval_runtime": 5.8234, |
|
"eval_samples_per_second": 85.86, |
|
"eval_steps_per_second": 10.818, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 25.225079365079367, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.03780718336484, |
|
"grad_norm": 1.7114077806472778, |
|
"learning_rate": 5e-05, |
|
"loss": 0.66, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 20.226843100189036, |
|
"grad_norm": 1.9962202310562134, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5938, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 20.415879017013232, |
|
"grad_norm": 2.397592067718506, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6042, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 20.604914933837428, |
|
"grad_norm": 2.0724213123321533, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6124, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 20.793950850661627, |
|
"grad_norm": 1.9385457038879395, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6078, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 20.982986767485823, |
|
"grad_norm": 2.034963607788086, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6257, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.5597948717948718, |
|
"eval_loss": 2.5497183799743652, |
|
"eval_runtime": 5.0442, |
|
"eval_samples_per_second": 99.123, |
|
"eval_steps_per_second": 12.489, |
|
"step": 11109 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 24.755238095238095, |
|
"step": 11109 |
|
}, |
|
{ |
|
"epoch": 21.17202268431002, |
|
"grad_norm": 1.9768177270889282, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5458, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 21.361058601134214, |
|
"grad_norm": 2.14420747756958, |
|
"learning_rate": 5e-05, |
|
"loss": 0.558, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 21.550094517958414, |
|
"grad_norm": 2.5267951488494873, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5646, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 2.114386558532715, |
|
"learning_rate": 5e-05, |
|
"loss": 0.572, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 21.928166351606805, |
|
"grad_norm": 2.424217700958252, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5928, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.5593846153846154, |
|
"eval_loss": 2.579535722732544, |
|
"eval_runtime": 5.7849, |
|
"eval_samples_per_second": 86.433, |
|
"eval_steps_per_second": 10.89, |
|
"step": 11638 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 23.666507936507937, |
|
"step": 11638 |
|
}, |
|
{ |
|
"epoch": 22.117202268431, |
|
"grad_norm": 1.7813427448272705, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5342, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 22.3062381852552, |
|
"grad_norm": 1.989479422569275, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5157, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 22.495274102079396, |
|
"grad_norm": 1.9694567918777466, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5236, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 22.68431001890359, |
|
"grad_norm": 2.0226142406463623, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5315, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 22.873345935727787, |
|
"grad_norm": 2.0204625129699707, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5378, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.558974358974359, |
|
"eval_loss": 2.5709009170532227, |
|
"eval_runtime": 5.0788, |
|
"eval_samples_per_second": 98.449, |
|
"eval_steps_per_second": 12.405, |
|
"step": 12167 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_exact_match": 16.4, |
|
"eval_f1": 23.69809523809524, |
|
"step": 12167 |
|
}, |
|
{ |
|
"epoch": 23.062381852551987, |
|
"grad_norm": 2.0633811950683594, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5183, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 23.251417769376182, |
|
"grad_norm": 2.3566079139709473, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4761, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 23.440453686200378, |
|
"grad_norm": 2.368450880050659, |
|
"learning_rate": 5e-05, |
|
"loss": 0.487, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 23.629489603024574, |
|
"grad_norm": 2.1096372604370117, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4945, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 23.81852551984877, |
|
"grad_norm": 2.2136454582214355, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5035, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.5586153846153846, |
|
"eval_loss": 2.5898056030273438, |
|
"eval_runtime": 5.7458, |
|
"eval_samples_per_second": 87.021, |
|
"eval_steps_per_second": 10.965, |
|
"step": 12696 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_exact_match": 17.8, |
|
"eval_f1": 24.89603174603175, |
|
"step": 12696 |
|
}, |
|
{ |
|
"epoch": 24.00756143667297, |
|
"grad_norm": 2.317958116531372, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5106, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 24.196597353497165, |
|
"grad_norm": 2.0783536434173584, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4404, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 24.38563327032136, |
|
"grad_norm": 2.322791337966919, |
|
"learning_rate": 5e-05, |
|
"loss": 0.452, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 24.574669187145556, |
|
"grad_norm": 1.9763044118881226, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4617, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 24.763705103969755, |
|
"grad_norm": 2.462392568588257, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4731, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 24.95274102079395, |
|
"grad_norm": 2.2400307655334473, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4751, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.5594871794871795, |
|
"eval_loss": 2.611313581466675, |
|
"eval_runtime": 5.3564, |
|
"eval_samples_per_second": 93.346, |
|
"eval_steps_per_second": 11.762, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_exact_match": 15.8, |
|
"eval_f1": 22.969047619047625, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 25.141776937618147, |
|
"grad_norm": 2.0931668281555176, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4299, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 25.330812854442343, |
|
"grad_norm": 2.7056140899658203, |
|
"learning_rate": 5e-05, |
|
"loss": 0.422, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 25.519848771266542, |
|
"grad_norm": 1.9907532930374146, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4374, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 25.708884688090738, |
|
"grad_norm": 2.3209221363067627, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4376, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 25.897920604914933, |
|
"grad_norm": 3.2339212894439697, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4432, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.5602051282051282, |
|
"eval_loss": 2.6182327270507812, |
|
"eval_runtime": 5.0321, |
|
"eval_samples_per_second": 99.362, |
|
"eval_steps_per_second": 12.52, |
|
"step": 13754 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_exact_match": 15.4, |
|
"eval_f1": 23.059047619047618, |
|
"step": 13754 |
|
}, |
|
{ |
|
"epoch": 26.08695652173913, |
|
"grad_norm": 1.9935250282287598, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4265, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 26.27599243856333, |
|
"grad_norm": 2.5698649883270264, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3949, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 26.465028355387524, |
|
"grad_norm": 2.568392038345337, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4041, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 26.65406427221172, |
|
"grad_norm": 2.094651699066162, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4128, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 26.843100189035916, |
|
"grad_norm": 2.240649700164795, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4191, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.56, |
|
"eval_loss": 2.614851951599121, |
|
"eval_runtime": 5.636, |
|
"eval_samples_per_second": 88.715, |
|
"eval_steps_per_second": 11.178, |
|
"step": 14283 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_exact_match": 17.2, |
|
"eval_f1": 24.941948051948046, |
|
"step": 14283 |
|
}, |
|
{ |
|
"epoch": 27.032136105860115, |
|
"grad_norm": 1.8800567388534546, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4135, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 27.22117202268431, |
|
"grad_norm": 2.525048017501831, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3717, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 27.410207939508506, |
|
"grad_norm": 2.3803513050079346, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3817, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 27.599243856332702, |
|
"grad_norm": 2.369356155395508, |
|
"learning_rate": 5e-05, |
|
"loss": 0.388, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 27.7882797731569, |
|
"grad_norm": 2.701702356338501, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3943, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 27.977315689981097, |
|
"grad_norm": 2.2755825519561768, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4015, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.5592820512820513, |
|
"eval_loss": 2.653672218322754, |
|
"eval_runtime": 5.3543, |
|
"eval_samples_per_second": 93.383, |
|
"eval_steps_per_second": 11.766, |
|
"step": 14812 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 24.478888888888893, |
|
"step": 14812 |
|
}, |
|
{ |
|
"epoch": 28.166351606805293, |
|
"grad_norm": 2.0458974838256836, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3566, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 28.35538752362949, |
|
"grad_norm": 2.040117025375366, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3602, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 28.544423440453688, |
|
"grad_norm": 1.947757363319397, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3666, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 28.733459357277884, |
|
"grad_norm": 2.0905520915985107, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3747, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 28.92249527410208, |
|
"grad_norm": 2.215851306915283, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3798, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.5600512820512821, |
|
"eval_loss": 2.6304192543029785, |
|
"eval_runtime": 5.0359, |
|
"eval_samples_per_second": 99.288, |
|
"eval_steps_per_second": 12.51, |
|
"step": 15341 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_exact_match": 16.2, |
|
"eval_f1": 23.909062049062054, |
|
"step": 15341 |
|
}, |
|
{ |
|
"epoch": 29.111531190926275, |
|
"grad_norm": 2.5050926208496094, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3538, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 29.300567107750474, |
|
"grad_norm": 2.0059566497802734, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3365, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 29.48960302457467, |
|
"grad_norm": 2.7678000926971436, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3523, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 29.678638941398866, |
|
"grad_norm": 2.6234450340270996, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3552, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 29.86767485822306, |
|
"grad_norm": 1.9958972930908203, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3629, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.56, |
|
"eval_loss": 2.6483516693115234, |
|
"eval_runtime": 5.0621, |
|
"eval_samples_per_second": 98.774, |
|
"eval_steps_per_second": 12.446, |
|
"step": 15870 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_exact_match": 16.4, |
|
"eval_f1": 24.021746031746037, |
|
"step": 15870 |
|
}, |
|
{ |
|
"epoch": 30.056710775047257, |
|
"grad_norm": 2.5568833351135254, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3505, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 30.245746691871457, |
|
"grad_norm": 2.0313165187835693, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3252, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 30.434782608695652, |
|
"grad_norm": 1.8152469396591187, |
|
"learning_rate": 5e-05, |
|
"loss": 0.335, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 30.623818525519848, |
|
"grad_norm": 2.0004210472106934, |
|
"learning_rate": 5e-05, |
|
"loss": 0.34, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 30.812854442344044, |
|
"grad_norm": 2.3558757305145264, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3474, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.5605641025641026, |
|
"eval_loss": 2.6599507331848145, |
|
"eval_runtime": 5.3592, |
|
"eval_samples_per_second": 93.297, |
|
"eval_steps_per_second": 11.755, |
|
"step": 16399 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 23.902857142857147, |
|
"step": 16399 |
|
}, |
|
{ |
|
"epoch": 31.001890359168243, |
|
"grad_norm": 1.641440749168396, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3478, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 31.19092627599244, |
|
"grad_norm": 2.425560712814331, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3067, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 31.379962192816635, |
|
"grad_norm": 1.850221872329712, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3202, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 31.56899810964083, |
|
"grad_norm": 1.4244951009750366, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3301, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 31.75803402646503, |
|
"grad_norm": 2.093834638595581, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3293, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 31.947069943289225, |
|
"grad_norm": 2.0703086853027344, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3369, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.5597435897435897, |
|
"eval_loss": 2.6675195693969727, |
|
"eval_runtime": 5.1201, |
|
"eval_samples_per_second": 97.655, |
|
"eval_steps_per_second": 12.305, |
|
"step": 16928 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_exact_match": 15.6, |
|
"eval_f1": 22.95126984126984, |
|
"step": 16928 |
|
}, |
|
{ |
|
"epoch": 32.136105860113425, |
|
"grad_norm": 2.0121419429779053, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3134, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 32.32514177693762, |
|
"grad_norm": 1.8783822059631348, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3102, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 32.514177693761816, |
|
"grad_norm": 1.5319064855575562, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3089, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 32.70321361058601, |
|
"grad_norm": 2.6580235958099365, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3175, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 32.89224952741021, |
|
"grad_norm": 2.121335744857788, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3275, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.5603589743589743, |
|
"eval_loss": 2.658357858657837, |
|
"eval_runtime": 5.4806, |
|
"eval_samples_per_second": 91.231, |
|
"eval_steps_per_second": 11.495, |
|
"step": 17457 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_exact_match": 16.4, |
|
"eval_f1": 24.0452380952381, |
|
"step": 17457 |
|
}, |
|
{ |
|
"epoch": 33.0812854442344, |
|
"grad_norm": 2.1282331943511963, |
|
"learning_rate": 5e-05, |
|
"loss": 0.311, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 33.2703213610586, |
|
"grad_norm": 2.212125062942505, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2951, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 33.459357277882795, |
|
"grad_norm": 3.389835834503174, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3068, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 33.648393194707, |
|
"grad_norm": 1.812578558921814, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3091, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 33.83742911153119, |
|
"grad_norm": 2.434966802597046, |
|
"learning_rate": 5e-05, |
|
"loss": 0.311, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.5600512820512821, |
|
"eval_loss": 2.676753520965576, |
|
"eval_runtime": 5.3579, |
|
"eval_samples_per_second": 93.32, |
|
"eval_steps_per_second": 11.758, |
|
"step": 17986 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_exact_match": 16.6, |
|
"eval_f1": 24.913650793650795, |
|
"step": 17986 |
|
}, |
|
{ |
|
"epoch": 34.02646502835539, |
|
"grad_norm": 1.5352602005004883, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3142, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 34.215500945179585, |
|
"grad_norm": 1.4355374574661255, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2897, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 34.40453686200378, |
|
"grad_norm": 1.7993170022964478, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2923, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 34.593572778827976, |
|
"grad_norm": 1.9478981494903564, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3001, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 34.78260869565217, |
|
"grad_norm": 3.045933246612549, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3034, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 34.97164461247637, |
|
"grad_norm": 1.8796017169952393, |
|
"learning_rate": 5e-05, |
|
"loss": 0.31, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.5601538461538461, |
|
"eval_loss": 2.6845180988311768, |
|
"eval_runtime": 5.7492, |
|
"eval_samples_per_second": 86.969, |
|
"eval_steps_per_second": 10.958, |
|
"step": 18515 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_exact_match": 16.2, |
|
"eval_f1": 23.91369408369409, |
|
"step": 18515 |
|
}, |
|
{ |
|
"epoch": 35.16068052930057, |
|
"grad_norm": 1.7619277238845825, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2853, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 35.349716446124766, |
|
"grad_norm": 1.7933720350265503, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2853, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 35.53875236294896, |
|
"grad_norm": 1.476181149482727, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2955, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 35.72778827977316, |
|
"grad_norm": 1.6427425146102905, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2934, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 35.916824196597354, |
|
"grad_norm": 1.7125171422958374, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3009, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.5604615384615385, |
|
"eval_loss": 2.6663565635681152, |
|
"eval_runtime": 5.0382, |
|
"eval_samples_per_second": 99.241, |
|
"eval_steps_per_second": 12.504, |
|
"step": 19044 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_exact_match": 16.0, |
|
"eval_f1": 23.221269841269848, |
|
"step": 19044 |
|
}, |
|
{ |
|
"epoch": 36.10586011342155, |
|
"grad_norm": 1.757238507270813, |
|
"learning_rate": 5e-05, |
|
"loss": 0.286, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 36.294896030245745, |
|
"grad_norm": 1.9411671161651611, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2761, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 36.48393194706994, |
|
"grad_norm": 2.305509328842163, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2873, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 36.67296786389414, |
|
"grad_norm": 1.6525070667266846, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2881, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 36.86200378071834, |
|
"grad_norm": 1.562827706336975, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2959, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.5594871794871795, |
|
"eval_loss": 2.738389492034912, |
|
"eval_runtime": 5.0296, |
|
"eval_samples_per_second": 99.412, |
|
"eval_steps_per_second": 12.526, |
|
"step": 19573 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_exact_match": 16.0, |
|
"eval_f1": 23.01206349206349, |
|
"step": 19573 |
|
}, |
|
{ |
|
"epoch": 37.051039697542535, |
|
"grad_norm": 2.588029146194458, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2885, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 37.24007561436673, |
|
"grad_norm": 2.2393579483032227, |
|
"learning_rate": 5e-05, |
|
"loss": 0.267, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 37.42911153119093, |
|
"grad_norm": 1.5349841117858887, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2787, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 37.61814744801512, |
|
"grad_norm": 1.8498040437698364, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2873, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 37.80718336483932, |
|
"grad_norm": 1.9959454536437988, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2879, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 37.996219281663514, |
|
"grad_norm": 1.814253330230713, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2927, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.5592820512820513, |
|
"eval_loss": 2.731750965118408, |
|
"eval_runtime": 5.032, |
|
"eval_samples_per_second": 99.364, |
|
"eval_steps_per_second": 12.52, |
|
"step": 20102 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_exact_match": 16.4, |
|
"eval_f1": 23.767460317460323, |
|
"step": 20102 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 26450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2383494727280886e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|