|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.0, |
|
"eval_steps": 500, |
|
"global_step": 8993, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.5107442140579224, |
|
"learning_rate": 5e-05, |
|
"loss": 2.0482, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.5563424229621887, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8835, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.4735371172428131, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8512, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.4543495178222656, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8489, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.5007112622261047, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8439, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5733333333333334, |
|
"eval_loss": 1.6194567680358887, |
|
"eval_runtime": 5.4154, |
|
"eval_samples_per_second": 92.33, |
|
"eval_steps_per_second": 11.634, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_exact_match": 17.0, |
|
"eval_f1": 26.880952380952387, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.5181360244750977, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8217, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 1.1348918676376343, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7866, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.661758303642273, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7934, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.5634772777557373, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7731, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 0.6698930263519287, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7969, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5751794871794872, |
|
"eval_loss": 1.603736162185669, |
|
"eval_runtime": 5.0986, |
|
"eval_samples_per_second": 98.066, |
|
"eval_steps_per_second": 12.356, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 27.218095238095238, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.6265193223953247, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7416, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 0.812364935874939, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6957, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 1.0972926616668701, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7033, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 1.0831658840179443, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6935, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 1.0022494792938232, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6799, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.573025641025641, |
|
"eval_loss": 1.6428505182266235, |
|
"eval_runtime": 5.1488, |
|
"eval_samples_per_second": 97.111, |
|
"eval_steps_per_second": 12.236, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_exact_match": 20.2, |
|
"eval_f1": 27.93809523809523, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 0.8240587711334229, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6618, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 1.2994645833969116, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5736, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 1.0891159772872925, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5798, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 1.3826475143432617, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5957, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 0.9901586174964905, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6026, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 1.0607813596725464, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6106, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5708717948717948, |
|
"eval_loss": 1.7143596410751343, |
|
"eval_runtime": 5.3942, |
|
"eval_samples_per_second": 92.693, |
|
"eval_steps_per_second": 11.679, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_exact_match": 19.0, |
|
"eval_f1": 27.71428571428571, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 1.3189178705215454, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5029, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 1.062267541885376, |
|
"learning_rate": 5e-05, |
|
"loss": 1.51, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 1.3810843229293823, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4867, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 1.1991653442382812, |
|
"learning_rate": 5e-05, |
|
"loss": 1.501, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 1.310654878616333, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5034, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5681025641025641, |
|
"eval_loss": 1.8328346014022827, |
|
"eval_runtime": 5.7763, |
|
"eval_samples_per_second": 86.561, |
|
"eval_steps_per_second": 10.907, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_exact_match": 18.8, |
|
"eval_f1": 26.6015873015873, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 1.1406631469726562, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4605, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 1.447221279144287, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4204, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 1.461799144744873, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4233, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 1.4184092283248901, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4265, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 0.9850455522537231, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4139, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.564974358974359, |
|
"eval_loss": 1.931396484375, |
|
"eval_runtime": 5.3358, |
|
"eval_samples_per_second": 93.706, |
|
"eval_steps_per_second": 11.807, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 25.464285714285715, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 1.2274116277694702, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4159, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 1.2107988595962524, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3579, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 1.2965952157974243, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3485, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 1.3035151958465576, |
|
"learning_rate": 5e-05, |
|
"loss": 1.36, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 1.1840296983718872, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3599, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 1.780287265777588, |
|
"learning_rate": 5e-05, |
|
"loss": 1.355, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.565076923076923, |
|
"eval_loss": 1.9659509658813477, |
|
"eval_runtime": 5.3127, |
|
"eval_samples_per_second": 94.114, |
|
"eval_steps_per_second": 11.858, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_exact_match": 18.0, |
|
"eval_f1": 26.546666666666663, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 1.1608421802520752, |
|
"learning_rate": 5e-05, |
|
"loss": 1.298, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 1.083270788192749, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3088, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 1.3489899635314941, |
|
"learning_rate": 5e-05, |
|
"loss": 1.299, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 1.1663447618484497, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2841, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 1.9308583736419678, |
|
"learning_rate": 5e-05, |
|
"loss": 1.314, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5616923076923077, |
|
"eval_loss": 2.1164443492889404, |
|
"eval_runtime": 5.2165, |
|
"eval_samples_per_second": 95.85, |
|
"eval_steps_per_second": 12.077, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_exact_match": 17.0, |
|
"eval_f1": 24.659841269841277, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 1.109632134437561, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2558, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 1.1132491827011108, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2264, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 1.1017369031906128, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2649, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 1.0874963998794556, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2512, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 1.2617064714431763, |
|
"learning_rate": 5e-05, |
|
"loss": 1.261, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5622051282051282, |
|
"eval_loss": 2.1635327339172363, |
|
"eval_runtime": 5.0832, |
|
"eval_samples_per_second": 98.363, |
|
"eval_steps_per_second": 12.394, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_exact_match": 17.8, |
|
"eval_f1": 25.451428571428576, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 1.2258027791976929, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2298, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 1.8261148929595947, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1748, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 1.1221072673797607, |
|
"learning_rate": 5e-05, |
|
"loss": 1.19, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 1.3320170640945435, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1932, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 1.194077730178833, |
|
"learning_rate": 5e-05, |
|
"loss": 1.217, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5612820512820513, |
|
"eval_loss": 2.244041919708252, |
|
"eval_runtime": 5.1606, |
|
"eval_samples_per_second": 96.889, |
|
"eval_steps_per_second": 12.208, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_exact_match": 18.4, |
|
"eval_f1": 26.067662337662348, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 1.1352814435958862, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2093, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 1.2741838693618774, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1467, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 1.6901249885559082, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1288, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 1.1944465637207031, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1399, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 1.2234572172164917, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1678, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 1.2781612873077393, |
|
"learning_rate": 5e-05, |
|
"loss": 1.146, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.5618461538461539, |
|
"eval_loss": 2.296630859375, |
|
"eval_runtime": 5.1078, |
|
"eval_samples_per_second": 97.889, |
|
"eval_steps_per_second": 12.334, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 23.63305916305917, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 1.138975739479065, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0842, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 1.727989912033081, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0806, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 1.3104358911514282, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0953, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 1.3629850149154663, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0945, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 1.3276056051254272, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1069, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.562, |
|
"eval_loss": 2.3118155002593994, |
|
"eval_runtime": 5.3837, |
|
"eval_samples_per_second": 92.873, |
|
"eval_steps_per_second": 11.702, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 24.43924963924964, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 1.2608078718185425, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0418, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 1.4456945657730103, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0258, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 1.4195287227630615, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0255, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 1.2595598697662354, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0511, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 1.5901292562484741, |
|
"learning_rate": 5e-05, |
|
"loss": 1.045, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.5614871794871795, |
|
"eval_loss": 2.3580987453460693, |
|
"eval_runtime": 5.3496, |
|
"eval_samples_per_second": 93.466, |
|
"eval_steps_per_second": 11.777, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_exact_match": 17.2, |
|
"eval_f1": 24.350995670995676, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 1.471511721611023, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0404, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 1.2928757667541504, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9573, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 1.5545622110366821, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9583, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 1.6132512092590332, |
|
"learning_rate": 5e-05, |
|
"loss": 0.981, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 1.7548742294311523, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9999, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 1.4219907522201538, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0028, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5618974358974359, |
|
"eval_loss": 2.413728952407837, |
|
"eval_runtime": 5.0464, |
|
"eval_samples_per_second": 99.08, |
|
"eval_steps_per_second": 12.484, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_exact_match": 16.6, |
|
"eval_f1": 24.201789321789327, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 1.5799965858459473, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9096, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 1.5961594581604004, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9075, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 1.5756512880325317, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9222, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 1.515251636505127, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9445, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 2.6804299354553223, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9153, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5596923076923077, |
|
"eval_loss": 2.4756176471710205, |
|
"eval_runtime": 5.1956, |
|
"eval_samples_per_second": 96.236, |
|
"eval_steps_per_second": 12.126, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_exact_match": 16.8, |
|
"eval_f1": 24.146709956709966, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 1.7786214351654053, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8874, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 1.709060788154602, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8538, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 2.3245434761047363, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8591, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 1.6101887226104736, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8593, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 1.5917792320251465, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8748, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5604615384615385, |
|
"eval_loss": 2.4434447288513184, |
|
"eval_runtime": 5.1072, |
|
"eval_samples_per_second": 97.901, |
|
"eval_steps_per_second": 12.336, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_exact_match": 15.6, |
|
"eval_f1": 22.917027417027416, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 1.6529980897903442, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8413, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 1.931520938873291, |
|
"learning_rate": 5e-05, |
|
"loss": 0.7863, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 1.8401894569396973, |
|
"learning_rate": 5e-05, |
|
"loss": 0.799, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 1.7952015399932861, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8238, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 1.8028634786605835, |
|
"learning_rate": 5e-05, |
|
"loss": 0.807, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5606153846153846, |
|
"eval_loss": 2.4520959854125977, |
|
"eval_runtime": 5.1769, |
|
"eval_samples_per_second": 96.584, |
|
"eval_steps_per_second": 12.17, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 25.104011544011545, |
|
"step": 8993 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 26450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.5399505656702566e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|