|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 33.0, |
|
"eval_steps": 500, |
|
"global_step": 17457, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.5558584332466125, |
|
"learning_rate": 3e-05, |
|
"loss": 2.1103, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.4733140468597412, |
|
"learning_rate": 3e-05, |
|
"loss": 1.931, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.48580050468444824, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8948, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.40655234456062317, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8988, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.6303955316543579, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8745, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5701538461538461, |
|
"eval_loss": 1.6455246210098267, |
|
"eval_runtime": 5.0805, |
|
"eval_samples_per_second": 98.415, |
|
"eval_steps_per_second": 12.4, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_exact_match": 15.0, |
|
"eval_f1": 23.927619047619068, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.4278818666934967, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8798, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 0.449830025434494, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8613, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.5130725502967834, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8709, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.4889715611934662, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8448, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 11.186380386352539, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8764, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5723589743589743, |
|
"eval_loss": 1.6143561601638794, |
|
"eval_runtime": 5.0466, |
|
"eval_samples_per_second": 99.077, |
|
"eval_steps_per_second": 12.484, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_exact_match": 15.6, |
|
"eval_f1": 25.825238095238113, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.5414931774139404, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8398, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 7.441005706787109, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8348, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 24.312145233154297, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8421, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 0.7407662868499756, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8292, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 0.9653642177581787, |
|
"learning_rate": 3e-05, |
|
"loss": 1.828, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5734358974358974, |
|
"eval_loss": 1.617864727973938, |
|
"eval_runtime": 5.2153, |
|
"eval_samples_per_second": 95.872, |
|
"eval_steps_per_second": 12.08, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_exact_match": 16.2, |
|
"eval_f1": 25.811428571428586, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 450.6055603027344, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8204, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 2.1127028465270996, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8361, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 564.9320068359375, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8338, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 52.5078010559082, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8075, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 27.87557601928711, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8175, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 4.810656547546387, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8218, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5728205128205128, |
|
"eval_loss": 1.6224406957626343, |
|
"eval_runtime": 5.0588, |
|
"eval_samples_per_second": 98.838, |
|
"eval_steps_per_second": 12.454, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_exact_match": 17.4, |
|
"eval_f1": 27.194285714285726, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 13.004076957702637, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8089, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 2.23368501663208, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8292, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 0.8635586500167847, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7992, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 6.5326738357543945, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7955, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 463.4647521972656, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7953, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5740512820512821, |
|
"eval_loss": 1.6217337846755981, |
|
"eval_runtime": 5.1898, |
|
"eval_samples_per_second": 96.342, |
|
"eval_steps_per_second": 12.139, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_exact_match": 17.6, |
|
"eval_f1": 26.161428571428587, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 203.82815551757812, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7845, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 660.8076782226562, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7909, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 5727.51806640625, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7909, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 4880698.0, |
|
"learning_rate": 3e-05, |
|
"loss": 3.5781, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 1192.6517333984375, |
|
"learning_rate": 3e-05, |
|
"loss": 3.8703, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5433846153846154, |
|
"eval_loss": 1.886108636856079, |
|
"eval_runtime": 5.5162, |
|
"eval_samples_per_second": 90.643, |
|
"eval_steps_per_second": 11.421, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_exact_match": 11.4, |
|
"eval_f1": 20.358571428571437, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 532.3778686523438, |
|
"learning_rate": 3e-05, |
|
"loss": 2.1701, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 565.6697998046875, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8533, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 25206.482421875, |
|
"learning_rate": 3e-05, |
|
"loss": 2.623, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 71.88556671142578, |
|
"learning_rate": 3e-05, |
|
"loss": 3.1573, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 269563.15625, |
|
"learning_rate": 3e-05, |
|
"loss": 2.2696, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 11768.7880859375, |
|
"learning_rate": 3e-05, |
|
"loss": 3.1506, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.2834871794871795, |
|
"eval_loss": 8.311681747436523, |
|
"eval_runtime": 5.3223, |
|
"eval_samples_per_second": 93.944, |
|
"eval_steps_per_second": 11.837, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.027458851580988985, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 6220.75244140625, |
|
"learning_rate": 3e-05, |
|
"loss": 4.273, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 3387.953857421875, |
|
"learning_rate": 3e-05, |
|
"loss": 7.7199, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 372.8681640625, |
|
"learning_rate": 3e-05, |
|
"loss": 7.7045, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 159.64283752441406, |
|
"learning_rate": 3e-05, |
|
"loss": 6.8901, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 751876.125, |
|
"learning_rate": 3e-05, |
|
"loss": 6.5238, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.22764102564102565, |
|
"eval_loss": 7.12347936630249, |
|
"eval_runtime": 5.3818, |
|
"eval_samples_per_second": 92.905, |
|
"eval_steps_per_second": 11.706, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.013333333333333336, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 1035664128.0, |
|
"learning_rate": 3e-05, |
|
"loss": 5.4157, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 11571242.0, |
|
"learning_rate": 3e-05, |
|
"loss": 5.8257, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 88236256.0, |
|
"learning_rate": 3e-05, |
|
"loss": 6.3609, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 105426080.0, |
|
"learning_rate": 3e-05, |
|
"loss": 6.1824, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 51438844.0, |
|
"learning_rate": 3e-05, |
|
"loss": 6.1818, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 11.187646865844727, |
|
"eval_runtime": 5.154, |
|
"eval_samples_per_second": 97.013, |
|
"eval_steps_per_second": 12.224, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 8742571.0, |
|
"learning_rate": 3e-05, |
|
"loss": 7.5029, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 4885384192.0, |
|
"learning_rate": 3e-05, |
|
"loss": 7.805, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 919074304.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.0712, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 9706883072.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.4915, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 35723780096.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3286, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 13.374640464782715, |
|
"eval_runtime": 5.2089, |
|
"eval_samples_per_second": 95.99, |
|
"eval_steps_per_second": 12.095, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 172018032.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.4403, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 329336992.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3233, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 74210560.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3673, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 497968480.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.8311, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 34138104.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.8161, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 308232992.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.2827, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 12.878591537475586, |
|
"eval_runtime": 5.1038, |
|
"eval_samples_per_second": 97.966, |
|
"eval_steps_per_second": 12.344, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 255807824.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.8782, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 27976615936.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.879, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 13783190732800.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.8617, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 3467661017088.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.1028, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 333752.3125, |
|
"learning_rate": 3e-05, |
|
"loss": 9.4972, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 13.161654472351074, |
|
"eval_runtime": 5.0927, |
|
"eval_samples_per_second": 98.18, |
|
"eval_steps_per_second": 12.371, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 6690696704.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.2939, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 148906835968.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.3499, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 15515338752.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.3496, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 76318760960.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.2728, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 85637283840.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.4453, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 12.748088836669922, |
|
"eval_runtime": 5.1332, |
|
"eval_samples_per_second": 97.405, |
|
"eval_steps_per_second": 12.273, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 370487552.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.8639, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 243782.78125, |
|
"learning_rate": 3e-05, |
|
"loss": 9.2996, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 572714688.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.0366, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 807026112.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.9777, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 401406112.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3094, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 20012445696.0, |
|
"learning_rate": 3e-05, |
|
"loss": 11.1388, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 11.845151901245117, |
|
"eval_runtime": 5.7887, |
|
"eval_samples_per_second": 86.376, |
|
"eval_steps_per_second": 10.883, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 61066740.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.4144, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 58445844.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.1294, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 40712032.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.1512, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 118321208.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.8949, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 61603804.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.849, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 14.529706954956055, |
|
"eval_runtime": 5.7662, |
|
"eval_samples_per_second": 86.712, |
|
"eval_steps_per_second": 10.926, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 1755613568.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.7302, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 1215193088.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.1428, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 40452497408.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.0763, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 10178840576.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.2204, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 10069305344.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.9088, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.22523076923076923, |
|
"eval_loss": 14.802911758422852, |
|
"eval_runtime": 5.0764, |
|
"eval_samples_per_second": 98.494, |
|
"eval_steps_per_second": 12.41, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 9244783616.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.2574, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 556177664.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.6479, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 44021864.0, |
|
"learning_rate": 3e-05, |
|
"loss": 11.8164, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 17835092.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.391, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 39907684.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.3715, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 15.876049995422363, |
|
"eval_runtime": 5.1121, |
|
"eval_samples_per_second": 97.807, |
|
"eval_steps_per_second": 12.324, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.013232514177695, |
|
"grad_norm": 5489768.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.9394, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.20226843100189, |
|
"grad_norm": 552121280.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.0591, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 4284707.5, |
|
"learning_rate": 3e-05, |
|
"loss": 14.7709, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 17.58034026465028, |
|
"grad_norm": 32860224.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.33, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 17.76937618147448, |
|
"grad_norm": 500232672.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.2314, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 17.958412098298677, |
|
"grad_norm": 138954320.0, |
|
"learning_rate": 3e-05, |
|
"loss": 14.0399, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 15.959660530090332, |
|
"eval_runtime": 5.4552, |
|
"eval_samples_per_second": 91.656, |
|
"eval_steps_per_second": 11.549, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.147448015122873, |
|
"grad_norm": 17529152.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.3582, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 18.33648393194707, |
|
"grad_norm": 14452664.0, |
|
"learning_rate": 3e-05, |
|
"loss": 16.0595, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 18.525519848771268, |
|
"grad_norm": 9175341.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.2152, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 18.714555765595463, |
|
"grad_norm": 3109339.75, |
|
"learning_rate": 3e-05, |
|
"loss": 15.6083, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 18.90359168241966, |
|
"grad_norm": 1141542.125, |
|
"learning_rate": 3e-05, |
|
"loss": 15.6927, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 16.572154998779297, |
|
"eval_runtime": 5.784, |
|
"eval_samples_per_second": 86.445, |
|
"eval_steps_per_second": 10.892, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.092627599243855, |
|
"grad_norm": 3051544.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.4752, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 19.281663516068054, |
|
"grad_norm": 32653528.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.3329, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 19.47069943289225, |
|
"grad_norm": 373167.34375, |
|
"learning_rate": 3e-05, |
|
"loss": 15.1992, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 19.659735349716446, |
|
"grad_norm": 761316507648.0, |
|
"learning_rate": 3e-05, |
|
"loss": 14.5326, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 19.84877126654064, |
|
"grad_norm": 15194965.0, |
|
"learning_rate": 3e-05, |
|
"loss": 14.6703, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.2026153846153846, |
|
"eval_loss": 12.01396656036377, |
|
"eval_runtime": 5.164, |
|
"eval_samples_per_second": 96.824, |
|
"eval_steps_per_second": 12.2, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.03780718336484, |
|
"grad_norm": 1443913.625, |
|
"learning_rate": 3e-05, |
|
"loss": 13.4546, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 20.226843100189036, |
|
"grad_norm": 38678328.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.1074, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 20.415879017013232, |
|
"grad_norm": 235378832.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.8794, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 20.604914933837428, |
|
"grad_norm": 7457460.5, |
|
"learning_rate": 3e-05, |
|
"loss": 15.9728, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 20.793950850661627, |
|
"grad_norm": 360401824.0, |
|
"learning_rate": 3e-05, |
|
"loss": 16.3397, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 20.982986767485823, |
|
"grad_norm": 5285015.0, |
|
"learning_rate": 3e-05, |
|
"loss": 16.1349, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 16.624380111694336, |
|
"eval_runtime": 5.0669, |
|
"eval_samples_per_second": 98.68, |
|
"eval_steps_per_second": 12.434, |
|
"step": 11109 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 11109 |
|
}, |
|
{ |
|
"epoch": 21.17202268431002, |
|
"grad_norm": 3120185999360.0, |
|
"learning_rate": 3e-05, |
|
"loss": 15.2325, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 21.361058601134214, |
|
"grad_norm": 459053662208.0, |
|
"learning_rate": 3e-05, |
|
"loss": 14.9804, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 21.550094517958414, |
|
"grad_norm": 33530312704.0, |
|
"learning_rate": 3e-05, |
|
"loss": 13.4482, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 524117879029760.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.8065, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 21.928166351606805, |
|
"grad_norm": 73415352385536.0, |
|
"learning_rate": 3e-05, |
|
"loss": 13.1619, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 15.610301971435547, |
|
"eval_runtime": 5.8348, |
|
"eval_samples_per_second": 85.693, |
|
"eval_steps_per_second": 10.797, |
|
"step": 11638 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 11638 |
|
}, |
|
{ |
|
"epoch": 22.117202268431, |
|
"grad_norm": 24663130374144.0, |
|
"learning_rate": 3e-05, |
|
"loss": 13.4567, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 22.3062381852552, |
|
"grad_norm": 1152798.75, |
|
"learning_rate": 3e-05, |
|
"loss": 13.3323, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 22.495274102079396, |
|
"grad_norm": 6407195.0, |
|
"learning_rate": 3e-05, |
|
"loss": 12.224, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 22.68431001890359, |
|
"grad_norm": 3026044.75, |
|
"learning_rate": 3e-05, |
|
"loss": 11.6534, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 22.873345935727787, |
|
"grad_norm": 433589.25, |
|
"learning_rate": 3e-05, |
|
"loss": 11.176, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.20005128205128206, |
|
"eval_loss": 9.37710952758789, |
|
"eval_runtime": 5.4109, |
|
"eval_samples_per_second": 92.407, |
|
"eval_steps_per_second": 11.643, |
|
"step": 12167 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 12167 |
|
}, |
|
{ |
|
"epoch": 23.062381852551987, |
|
"grad_norm": 221080977408.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.2085, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 23.251417769376182, |
|
"grad_norm": 819255836672.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3606, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 23.440453686200378, |
|
"grad_norm": 1077065216.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3544, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 23.629489603024574, |
|
"grad_norm": 147283214336.0, |
|
"learning_rate": 3e-05, |
|
"loss": 10.6161, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 23.81852551984877, |
|
"grad_norm": 479103936.0, |
|
"learning_rate": 3e-05, |
|
"loss": 11.0412, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.2362051282051282, |
|
"eval_loss": 8.926570892333984, |
|
"eval_runtime": 5.7624, |
|
"eval_samples_per_second": 86.769, |
|
"eval_steps_per_second": 10.933, |
|
"step": 12696 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 12696 |
|
}, |
|
{ |
|
"epoch": 24.00756143667297, |
|
"grad_norm": 18736.490234375, |
|
"learning_rate": 3e-05, |
|
"loss": 10.5723, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 24.196597353497165, |
|
"grad_norm": 10122.208984375, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1505, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 24.38563327032136, |
|
"grad_norm": 884941.875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.6122, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 24.574669187145556, |
|
"grad_norm": 51996.421875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.5405, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 24.763705103969755, |
|
"grad_norm": 11833.41796875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.4026, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 24.95274102079395, |
|
"grad_norm": 645689536.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.8444, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.20235897435897435, |
|
"eval_loss": 10.400785446166992, |
|
"eval_runtime": 5.8159, |
|
"eval_samples_per_second": 85.971, |
|
"eval_steps_per_second": 10.832, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 25.141776937618147, |
|
"grad_norm": 33455.7734375, |
|
"learning_rate": 3e-05, |
|
"loss": 8.9314, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 25.330812854442343, |
|
"grad_norm": 665939.625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.9419, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 25.519848771266542, |
|
"grad_norm": 366503.25, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1085, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 25.708884688090738, |
|
"grad_norm": 1146194755584.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1301, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 25.897920604914933, |
|
"grad_norm": 5115191230464.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.9435, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 12.846183776855469, |
|
"eval_runtime": 5.1674, |
|
"eval_samples_per_second": 96.76, |
|
"eval_steps_per_second": 12.192, |
|
"step": 13754 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 13754 |
|
}, |
|
{ |
|
"epoch": 26.08695652173913, |
|
"grad_norm": 119.87647247314453, |
|
"learning_rate": 3e-05, |
|
"loss": 10.1216, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 26.27599243856333, |
|
"grad_norm": 5832.49462890625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.5386, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 26.465028355387524, |
|
"grad_norm": 7308.30908203125, |
|
"learning_rate": 3e-05, |
|
"loss": 8.1093, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 26.65406427221172, |
|
"grad_norm": 21071.32421875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.0342, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 26.843100189035916, |
|
"grad_norm": 69589.0625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.0313, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.23364102564102565, |
|
"eval_loss": 7.521541595458984, |
|
"eval_runtime": 5.0339, |
|
"eval_samples_per_second": 99.326, |
|
"eval_steps_per_second": 12.515, |
|
"step": 14283 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 14283 |
|
}, |
|
{ |
|
"epoch": 27.032136105860115, |
|
"grad_norm": 158736.625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.1019, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 27.22117202268431, |
|
"grad_norm": 12491.8095703125, |
|
"learning_rate": 3e-05, |
|
"loss": 8.2181, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 27.410207939508506, |
|
"grad_norm": 123181.3671875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3091, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 27.599243856332702, |
|
"grad_norm": 620153.5, |
|
"learning_rate": 3e-05, |
|
"loss": 8.4058, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 27.7882797731569, |
|
"grad_norm": 337683.1875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.619, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 27.977315689981097, |
|
"grad_norm": 5629.42431640625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3923, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.21205128205128204, |
|
"eval_loss": 7.625494003295898, |
|
"eval_runtime": 5.1068, |
|
"eval_samples_per_second": 97.909, |
|
"eval_steps_per_second": 12.336, |
|
"step": 14812 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 14812 |
|
}, |
|
{ |
|
"epoch": 28.166351606805293, |
|
"grad_norm": 779905.0625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.2129, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 28.35538752362949, |
|
"grad_norm": 92020.96875, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3803, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 28.544423440453688, |
|
"grad_norm": 1754453.125, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3285, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 28.733459357277884, |
|
"grad_norm": 9245.0390625, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3239, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 28.92249527410208, |
|
"grad_norm": 57860676.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.5368, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.1995897435897436, |
|
"eval_loss": 9.528727531433105, |
|
"eval_runtime": 5.1078, |
|
"eval_samples_per_second": 97.89, |
|
"eval_steps_per_second": 12.334, |
|
"step": 15341 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 15341 |
|
}, |
|
{ |
|
"epoch": 29.111531190926275, |
|
"grad_norm": 95621760.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.2102, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 29.300567107750474, |
|
"grad_norm": 68808096.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1584, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 29.48960302457467, |
|
"grad_norm": 394869760.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.0096, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 29.678638941398866, |
|
"grad_norm": 295963264.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.1208, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 29.86767485822306, |
|
"grad_norm": 471883200.0, |
|
"learning_rate": 3e-05, |
|
"loss": 7.964, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.19994871794871794, |
|
"eval_loss": 7.713045120239258, |
|
"eval_runtime": 5.3921, |
|
"eval_samples_per_second": 92.728, |
|
"eval_steps_per_second": 11.684, |
|
"step": 15870 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 15870 |
|
}, |
|
{ |
|
"epoch": 30.056710775047257, |
|
"grad_norm": 1943886.25, |
|
"learning_rate": 3e-05, |
|
"loss": 8.051, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 30.245746691871457, |
|
"grad_norm": 252297.234375, |
|
"learning_rate": 3e-05, |
|
"loss": 8.2152, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 30.434782608695652, |
|
"grad_norm": 20118.697265625, |
|
"learning_rate": 3e-05, |
|
"loss": 8.586, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 30.623818525519848, |
|
"grad_norm": 30278450.0, |
|
"learning_rate": 3e-05, |
|
"loss": 8.594, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 30.812854442344044, |
|
"grad_norm": 491771.5, |
|
"learning_rate": 3e-05, |
|
"loss": 8.861, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.2362051282051282, |
|
"eval_loss": 7.903472423553467, |
|
"eval_runtime": 5.622, |
|
"eval_samples_per_second": 88.937, |
|
"eval_steps_per_second": 11.206, |
|
"step": 16399 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 16399 |
|
}, |
|
{ |
|
"epoch": 31.001890359168243, |
|
"grad_norm": 67330.3125, |
|
"learning_rate": 3e-05, |
|
"loss": 8.3547, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 31.19092627599244, |
|
"grad_norm": 437164.03125, |
|
"learning_rate": 3e-05, |
|
"loss": 9.0483, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 31.379962192816635, |
|
"grad_norm": 383445.78125, |
|
"learning_rate": 3e-05, |
|
"loss": 8.8258, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 31.56899810964083, |
|
"grad_norm": 26452162510848.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.2178, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 31.75803402646503, |
|
"grad_norm": 1477470060544.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.0897, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 31.947069943289225, |
|
"grad_norm": 3547858688.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1201, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.2362051282051282, |
|
"eval_loss": 8.878704071044922, |
|
"eval_runtime": 5.7948, |
|
"eval_samples_per_second": 86.284, |
|
"eval_steps_per_second": 10.872, |
|
"step": 16928 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 16928 |
|
}, |
|
{ |
|
"epoch": 32.136105860113425, |
|
"grad_norm": 25736851226624.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.0721, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 32.32514177693762, |
|
"grad_norm": 83071770624.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1184, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 32.514177693761816, |
|
"grad_norm": 362845667328.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1277, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 32.70321361058601, |
|
"grad_norm": 6446509981696.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1827, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 32.89224952741021, |
|
"grad_norm": 8676001710080.0, |
|
"learning_rate": 3e-05, |
|
"loss": 9.1903, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.2003076923076923, |
|
"eval_loss": 12.143142700195312, |
|
"eval_runtime": 5.0907, |
|
"eval_samples_per_second": 98.218, |
|
"eval_steps_per_second": 12.375, |
|
"step": 17457 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_exact_match": 0.0, |
|
"eval_f1": 0.0, |
|
"step": 17457 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 26450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.42101722663433e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|