|
{ |
|
"best_metric": 0.6129801869392395, |
|
"best_model_checkpoint": "Action_model/checkpoint-300", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 2680, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.570383071899414, |
|
"learning_rate": 9.96268656716418e-05, |
|
"loss": 0.1841, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.266295433044434, |
|
"learning_rate": 9.925373134328359e-05, |
|
"loss": 0.2301, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.001986503601074, |
|
"learning_rate": 9.888059701492539e-05, |
|
"loss": 0.2533, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.319194316864014, |
|
"learning_rate": 9.850746268656717e-05, |
|
"loss": 0.2436, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.9653372764587402, |
|
"learning_rate": 9.813432835820896e-05, |
|
"loss": 0.3712, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 7.348043441772461, |
|
"learning_rate": 9.776119402985075e-05, |
|
"loss": 0.3645, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.1969542503356934, |
|
"learning_rate": 9.738805970149254e-05, |
|
"loss": 0.4609, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.397550106048584, |
|
"learning_rate": 9.701492537313434e-05, |
|
"loss": 0.4755, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.923007488250732, |
|
"learning_rate": 9.664179104477612e-05, |
|
"loss": 0.3901, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.786198616027832, |
|
"learning_rate": 9.626865671641792e-05, |
|
"loss": 0.255, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_accuracy": 0.7926186291739895, |
|
"eval_loss": 0.7616190314292908, |
|
"eval_runtime": 8.7209, |
|
"eval_samples_per_second": 65.245, |
|
"eval_steps_per_second": 8.256, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.368223190307617, |
|
"learning_rate": 9.58955223880597e-05, |
|
"loss": 0.3784, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.078306198120117, |
|
"learning_rate": 9.552238805970149e-05, |
|
"loss": 0.4148, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.815361022949219, |
|
"learning_rate": 9.514925373134329e-05, |
|
"loss": 0.3621, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 11.498431205749512, |
|
"learning_rate": 9.477611940298507e-05, |
|
"loss": 0.3974, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.946558952331543, |
|
"learning_rate": 9.440298507462687e-05, |
|
"loss": 0.3856, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.3486919403076172, |
|
"learning_rate": 9.402985074626867e-05, |
|
"loss": 0.2435, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.267444133758545, |
|
"learning_rate": 9.365671641791045e-05, |
|
"loss": 0.3736, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.022345542907715, |
|
"learning_rate": 9.328358208955224e-05, |
|
"loss": 0.439, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.57196044921875, |
|
"learning_rate": 9.291044776119402e-05, |
|
"loss": 0.2996, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.636216640472412, |
|
"learning_rate": 9.253731343283582e-05, |
|
"loss": 0.2048, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_accuracy": 0.8084358523725835, |
|
"eval_loss": 0.724670946598053, |
|
"eval_runtime": 8.4461, |
|
"eval_samples_per_second": 67.368, |
|
"eval_steps_per_second": 8.525, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.615098237991333, |
|
"learning_rate": 9.216417910447762e-05, |
|
"loss": 0.3594, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 9.315821647644043, |
|
"learning_rate": 9.17910447761194e-05, |
|
"loss": 0.3046, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.669430732727051, |
|
"learning_rate": 9.14179104477612e-05, |
|
"loss": 0.4158, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 7.0882978439331055, |
|
"learning_rate": 9.104477611940299e-05, |
|
"loss": 0.3477, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.1667325496673584, |
|
"learning_rate": 9.067164179104479e-05, |
|
"loss": 0.316, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.482625961303711, |
|
"learning_rate": 9.029850746268657e-05, |
|
"loss": 0.3922, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.20793116092681885, |
|
"learning_rate": 8.992537313432836e-05, |
|
"loss": 0.3751, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 6.772298812866211, |
|
"learning_rate": 8.955223880597016e-05, |
|
"loss": 0.3269, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.833349227905273, |
|
"learning_rate": 8.917910447761194e-05, |
|
"loss": 0.3026, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 6.349458694458008, |
|
"learning_rate": 8.880597014925374e-05, |
|
"loss": 0.3763, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_accuracy": 0.8330404217926186, |
|
"eval_loss": 0.6129801869392395, |
|
"eval_runtime": 8.4095, |
|
"eval_samples_per_second": 67.661, |
|
"eval_steps_per_second": 8.562, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 4.767229080200195, |
|
"learning_rate": 8.843283582089554e-05, |
|
"loss": 0.3808, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 12.675297737121582, |
|
"learning_rate": 8.805970149253732e-05, |
|
"loss": 0.3766, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.8118245601654053, |
|
"learning_rate": 8.76865671641791e-05, |
|
"loss": 0.2642, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 8.736045837402344, |
|
"learning_rate": 8.731343283582089e-05, |
|
"loss": 0.3041, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 6.683359146118164, |
|
"learning_rate": 8.694029850746269e-05, |
|
"loss": 0.1352, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 4.780521392822266, |
|
"learning_rate": 8.656716417910447e-05, |
|
"loss": 0.4005, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 9.654714584350586, |
|
"learning_rate": 8.619402985074627e-05, |
|
"loss": 0.3646, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 4.174666881561279, |
|
"learning_rate": 8.582089552238807e-05, |
|
"loss": 0.2353, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 7.596667289733887, |
|
"learning_rate": 8.548507462686568e-05, |
|
"loss": 0.3991, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 5.592709064483643, |
|
"learning_rate": 8.511194029850747e-05, |
|
"loss": 0.307, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_accuracy": 0.789103690685413, |
|
"eval_loss": 0.8137023448944092, |
|
"eval_runtime": 8.3292, |
|
"eval_samples_per_second": 68.314, |
|
"eval_steps_per_second": 8.644, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.232590675354004, |
|
"learning_rate": 8.473880597014926e-05, |
|
"loss": 0.4669, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 4.276609897613525, |
|
"learning_rate": 8.436567164179105e-05, |
|
"loss": 0.3831, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 7.262507915496826, |
|
"learning_rate": 8.399253731343283e-05, |
|
"loss": 0.3472, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 7.258556365966797, |
|
"learning_rate": 8.361940298507463e-05, |
|
"loss": 0.2396, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.945961952209473, |
|
"learning_rate": 8.324626865671642e-05, |
|
"loss": 0.2433, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 5.138702392578125, |
|
"learning_rate": 8.287313432835821e-05, |
|
"loss": 0.2947, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.1640909910202026, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.4791, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 4.626485824584961, |
|
"learning_rate": 8.21268656716418e-05, |
|
"loss": 0.286, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 5.178492069244385, |
|
"learning_rate": 8.17537313432836e-05, |
|
"loss": 0.3202, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 7.854339122772217, |
|
"learning_rate": 8.138059701492538e-05, |
|
"loss": 0.3542, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_accuracy": 0.8014059753954306, |
|
"eval_loss": 0.6611581444740295, |
|
"eval_runtime": 8.5853, |
|
"eval_samples_per_second": 66.276, |
|
"eval_steps_per_second": 8.386, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.429740071296692, |
|
"learning_rate": 8.100746268656717e-05, |
|
"loss": 0.3039, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.9776551723480225, |
|
"learning_rate": 8.063432835820895e-05, |
|
"loss": 0.3825, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 10.557899475097656, |
|
"learning_rate": 8.026119402985075e-05, |
|
"loss": 0.5109, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.448002815246582, |
|
"learning_rate": 7.988805970149255e-05, |
|
"loss": 0.3421, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 4.500860691070557, |
|
"learning_rate": 7.951492537313433e-05, |
|
"loss": 0.3008, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 8.077374458312988, |
|
"learning_rate": 7.914179104477613e-05, |
|
"loss": 0.27, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.16809479892253876, |
|
"learning_rate": 7.876865671641792e-05, |
|
"loss": 0.2184, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 4.892763137817383, |
|
"learning_rate": 7.83955223880597e-05, |
|
"loss": 0.1479, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 8.35221004486084, |
|
"learning_rate": 7.80223880597015e-05, |
|
"loss": 0.3498, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 12.043429374694824, |
|
"learning_rate": 7.764925373134328e-05, |
|
"loss": 0.3518, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_accuracy": 0.8189806678383128, |
|
"eval_loss": 0.6964564919471741, |
|
"eval_runtime": 8.3878, |
|
"eval_samples_per_second": 67.837, |
|
"eval_steps_per_second": 8.584, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 3.7737715244293213, |
|
"learning_rate": 7.727611940298508e-05, |
|
"loss": 0.3532, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 4.282881736755371, |
|
"learning_rate": 7.690298507462687e-05, |
|
"loss": 0.2214, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 6.733531475067139, |
|
"learning_rate": 7.652985074626866e-05, |
|
"loss": 0.2709, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.567267417907715, |
|
"learning_rate": 7.615671641791045e-05, |
|
"loss": 0.3725, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 3.120966911315918, |
|
"learning_rate": 7.578358208955223e-05, |
|
"loss": 0.3036, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 6.505622386932373, |
|
"learning_rate": 7.541044776119403e-05, |
|
"loss": 0.2426, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.887637615203857, |
|
"learning_rate": 7.503731343283582e-05, |
|
"loss": 0.281, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 9.790969848632812, |
|
"learning_rate": 7.466417910447762e-05, |
|
"loss": 0.4504, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 4.354789733886719, |
|
"learning_rate": 7.429104477611941e-05, |
|
"loss": 0.4094, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 5.015912055969238, |
|
"learning_rate": 7.39179104477612e-05, |
|
"loss": 0.3706, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"eval_accuracy": 0.804920913884007, |
|
"eval_loss": 0.7254143357276917, |
|
"eval_runtime": 8.3242, |
|
"eval_samples_per_second": 68.355, |
|
"eval_steps_per_second": 8.649, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 5.382541656494141, |
|
"learning_rate": 7.3544776119403e-05, |
|
"loss": 0.1722, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 5.573971748352051, |
|
"learning_rate": 7.317164179104478e-05, |
|
"loss": 0.327, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 3.5606117248535156, |
|
"learning_rate": 7.279850746268657e-05, |
|
"loss": 0.2702, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.7398028373718262, |
|
"learning_rate": 7.242537313432837e-05, |
|
"loss": 0.238, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.7511751651763916, |
|
"learning_rate": 7.205223880597015e-05, |
|
"loss": 0.1848, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 3.381510019302368, |
|
"learning_rate": 7.167910447761195e-05, |
|
"loss": 0.2261, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 4.65634298324585, |
|
"learning_rate": 7.130597014925373e-05, |
|
"loss": 0.237, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 10.35020923614502, |
|
"learning_rate": 7.093283582089553e-05, |
|
"loss": 0.3012, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 8.878485679626465, |
|
"learning_rate": 7.055970149253732e-05, |
|
"loss": 0.4094, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.9728074073791504, |
|
"learning_rate": 7.01865671641791e-05, |
|
"loss": 0.4084, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_accuracy": 0.8101933216168717, |
|
"eval_loss": 0.6746156811714172, |
|
"eval_runtime": 8.2718, |
|
"eval_samples_per_second": 68.788, |
|
"eval_steps_per_second": 8.704, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 4.835368633270264, |
|
"learning_rate": 6.98134328358209e-05, |
|
"loss": 0.3152, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.9197049140930176, |
|
"learning_rate": 6.944029850746268e-05, |
|
"loss": 0.3433, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 5.646128177642822, |
|
"learning_rate": 6.906716417910448e-05, |
|
"loss": 0.2604, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 3.860607862472534, |
|
"learning_rate": 6.869402985074627e-05, |
|
"loss": 0.2831, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.1358175426721573, |
|
"learning_rate": 6.832089552238807e-05, |
|
"loss": 0.242, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.1011104583740234, |
|
"learning_rate": 6.794776119402985e-05, |
|
"loss": 0.2621, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 7.837879180908203, |
|
"learning_rate": 6.757462686567164e-05, |
|
"loss": 0.249, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 6.8647613525390625, |
|
"learning_rate": 6.720149253731343e-05, |
|
"loss": 0.3398, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 2.8186678886413574, |
|
"learning_rate": 6.682835820895522e-05, |
|
"loss": 0.3092, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 4.623282432556152, |
|
"learning_rate": 6.645522388059702e-05, |
|
"loss": 0.2533, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_accuracy": 0.8189806678383128, |
|
"eval_loss": 0.6866591572761536, |
|
"eval_runtime": 8.3143, |
|
"eval_samples_per_second": 68.436, |
|
"eval_steps_per_second": 8.66, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 4.85120964050293, |
|
"learning_rate": 6.608208955223882e-05, |
|
"loss": 0.2279, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.7263774275779724, |
|
"learning_rate": 6.57089552238806e-05, |
|
"loss": 0.1725, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 6.813180923461914, |
|
"learning_rate": 6.53358208955224e-05, |
|
"loss": 0.3304, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 8.58501148223877, |
|
"learning_rate": 6.496268656716418e-05, |
|
"loss": 0.1864, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 2.814436435699463, |
|
"learning_rate": 6.458955223880597e-05, |
|
"loss": 0.1496, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 8.36603832244873, |
|
"learning_rate": 6.421641791044777e-05, |
|
"loss": 0.208, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 3.5715956687927246, |
|
"learning_rate": 6.384328358208955e-05, |
|
"loss": 0.2429, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 4.983556270599365, |
|
"learning_rate": 6.347014925373135e-05, |
|
"loss": 0.4053, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 4.936723232269287, |
|
"learning_rate": 6.309701492537313e-05, |
|
"loss": 0.1545, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 6.59185791015625, |
|
"learning_rate": 6.272388059701493e-05, |
|
"loss": 0.3147, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"eval_accuracy": 0.8189806678383128, |
|
"eval_loss": 0.7077136635780334, |
|
"eval_runtime": 8.3117, |
|
"eval_samples_per_second": 68.457, |
|
"eval_steps_per_second": 8.662, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 9.348366737365723, |
|
"learning_rate": 6.235074626865672e-05, |
|
"loss": 0.3634, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 9.918521881103516, |
|
"learning_rate": 6.19776119402985e-05, |
|
"loss": 0.3151, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 5.687044143676758, |
|
"learning_rate": 6.16044776119403e-05, |
|
"loss": 0.3088, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 3.8347887992858887, |
|
"learning_rate": 6.123134328358209e-05, |
|
"loss": 0.2128, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 5.380050182342529, |
|
"learning_rate": 6.0858208955223884e-05, |
|
"loss": 0.255, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 8.848828315734863, |
|
"learning_rate": 6.0485074626865676e-05, |
|
"loss": 0.2794, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 3.9666404724121094, |
|
"learning_rate": 6.011194029850746e-05, |
|
"loss": 0.1954, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.3369455635547638, |
|
"learning_rate": 5.973880597014926e-05, |
|
"loss": 0.2298, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 16.327823638916016, |
|
"learning_rate": 5.9365671641791044e-05, |
|
"loss": 0.2504, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 7.070168495178223, |
|
"learning_rate": 5.8992537313432835e-05, |
|
"loss": 0.3182, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"eval_accuracy": 0.8189806678383128, |
|
"eval_loss": 0.6661401987075806, |
|
"eval_runtime": 8.2263, |
|
"eval_samples_per_second": 69.169, |
|
"eval_steps_per_second": 8.752, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 2.853975534439087, |
|
"learning_rate": 5.8619402985074634e-05, |
|
"loss": 0.201, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.958690881729126, |
|
"learning_rate": 5.824626865671642e-05, |
|
"loss": 0.1833, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 3.4794461727142334, |
|
"learning_rate": 5.787313432835822e-05, |
|
"loss": 0.2796, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 4.793296813964844, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.2281, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 6.200154781341553, |
|
"learning_rate": 5.712686567164179e-05, |
|
"loss": 0.2814, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 5.616389274597168, |
|
"learning_rate": 5.675373134328359e-05, |
|
"loss": 0.1656, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 9.382554054260254, |
|
"learning_rate": 5.6380597014925376e-05, |
|
"loss": 0.19, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 3.526240587234497, |
|
"learning_rate": 5.600746268656717e-05, |
|
"loss": 0.2063, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 3.494896650314331, |
|
"learning_rate": 5.563432835820895e-05, |
|
"loss": 0.1681, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 5.764057636260986, |
|
"learning_rate": 5.526119402985075e-05, |
|
"loss": 0.2248, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_accuracy": 0.8418277680140598, |
|
"eval_loss": 0.6632041335105896, |
|
"eval_runtime": 8.1661, |
|
"eval_samples_per_second": 69.679, |
|
"eval_steps_per_second": 8.817, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 4.680635452270508, |
|
"learning_rate": 5.488805970149254e-05, |
|
"loss": 0.2179, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 10.24306869506836, |
|
"learning_rate": 5.451492537313433e-05, |
|
"loss": 0.2187, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 3.054690361022949, |
|
"learning_rate": 5.4141791044776126e-05, |
|
"loss": 0.1729, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 4.907272815704346, |
|
"learning_rate": 5.376865671641791e-05, |
|
"loss": 0.2762, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 4.774748802185059, |
|
"learning_rate": 5.33955223880597e-05, |
|
"loss": 0.1965, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 5.757875919342041, |
|
"learning_rate": 5.30223880597015e-05, |
|
"loss": 0.1564, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.3608088791370392, |
|
"learning_rate": 5.2649253731343286e-05, |
|
"loss": 0.0946, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 3.6289939880371094, |
|
"learning_rate": 5.227611940298508e-05, |
|
"loss": 0.3364, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 5.132009029388428, |
|
"learning_rate": 5.190298507462686e-05, |
|
"loss": 0.231, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 1.0347099304199219, |
|
"learning_rate": 5.152985074626866e-05, |
|
"loss": 0.1617, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"eval_accuracy": 0.8172231985940246, |
|
"eval_loss": 0.7277125716209412, |
|
"eval_runtime": 8.4693, |
|
"eval_samples_per_second": 67.184, |
|
"eval_steps_per_second": 8.501, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 2.5996298789978027, |
|
"learning_rate": 5.115671641791045e-05, |
|
"loss": 0.385, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 3.724181890487671, |
|
"learning_rate": 5.078358208955224e-05, |
|
"loss": 0.1786, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 2.150557518005371, |
|
"learning_rate": 5.0410447761194035e-05, |
|
"loss": 0.2122, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 3.8813323974609375, |
|
"learning_rate": 5.003731343283582e-05, |
|
"loss": 0.2425, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 0.896369457244873, |
|
"learning_rate": 4.966417910447762e-05, |
|
"loss": 0.2208, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"grad_norm": 9.002110481262207, |
|
"learning_rate": 4.92910447761194e-05, |
|
"loss": 0.1432, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 9.619662284851074, |
|
"learning_rate": 4.8917910447761195e-05, |
|
"loss": 0.1347, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 3.5148773193359375, |
|
"learning_rate": 4.8544776119402986e-05, |
|
"loss": 0.2837, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 7.631669044494629, |
|
"learning_rate": 4.817164179104478e-05, |
|
"loss": 0.1887, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 11.738872528076172, |
|
"learning_rate": 4.779850746268657e-05, |
|
"loss": 0.2578, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"eval_accuracy": 0.8189806678383128, |
|
"eval_loss": 0.7114442586898804, |
|
"eval_runtime": 8.2672, |
|
"eval_samples_per_second": 68.826, |
|
"eval_steps_per_second": 8.709, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"grad_norm": 6.67802095413208, |
|
"learning_rate": 4.742537313432836e-05, |
|
"loss": 0.2527, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 4.491325378417969, |
|
"learning_rate": 4.705223880597015e-05, |
|
"loss": 0.2386, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 1.1810379028320312, |
|
"learning_rate": 4.667910447761194e-05, |
|
"loss": 0.1693, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 6.075868129730225, |
|
"learning_rate": 4.6305970149253736e-05, |
|
"loss": 0.167, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 2.315635919570923, |
|
"learning_rate": 4.593283582089553e-05, |
|
"loss": 0.2243, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 10.839255332946777, |
|
"learning_rate": 4.555970149253732e-05, |
|
"loss": 0.2414, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 4.562304496765137, |
|
"learning_rate": 4.5186567164179104e-05, |
|
"loss": 0.264, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 1.8821789026260376, |
|
"learning_rate": 4.4813432835820895e-05, |
|
"loss": 0.1407, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 8.406396865844727, |
|
"learning_rate": 4.4440298507462694e-05, |
|
"loss": 0.1454, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 0.2816010117530823, |
|
"learning_rate": 4.406716417910448e-05, |
|
"loss": 0.1864, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"eval_accuracy": 0.8172231985940246, |
|
"eval_loss": 0.755394458770752, |
|
"eval_runtime": 8.2598, |
|
"eval_samples_per_second": 68.888, |
|
"eval_steps_per_second": 8.717, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"grad_norm": 6.619854927062988, |
|
"learning_rate": 4.369402985074627e-05, |
|
"loss": 0.2806, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 2.056018829345703, |
|
"learning_rate": 4.332089552238806e-05, |
|
"loss": 0.2583, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 0.966521680355072, |
|
"learning_rate": 4.294776119402985e-05, |
|
"loss": 0.0997, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 2.8261241912841797, |
|
"learning_rate": 4.2574626865671645e-05, |
|
"loss": 0.1604, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 3.089912176132202, |
|
"learning_rate": 4.2201492537313436e-05, |
|
"loss": 0.2775, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 7.935690879821777, |
|
"learning_rate": 4.182835820895523e-05, |
|
"loss": 0.2522, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.7999266982078552, |
|
"learning_rate": 4.145522388059702e-05, |
|
"loss": 0.0752, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 6.0712480545043945, |
|
"learning_rate": 4.1082089552238804e-05, |
|
"loss": 0.1933, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"grad_norm": 10.768308639526367, |
|
"learning_rate": 4.07089552238806e-05, |
|
"loss": 0.1664, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 9.641716003417969, |
|
"learning_rate": 4.0335820895522394e-05, |
|
"loss": 0.3134, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"eval_accuracy": 0.8154657293497364, |
|
"eval_loss": 0.7593356966972351, |
|
"eval_runtime": 8.4455, |
|
"eval_samples_per_second": 67.373, |
|
"eval_steps_per_second": 8.525, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 6.7538838386535645, |
|
"learning_rate": 3.996268656716418e-05, |
|
"loss": 0.1747, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 6.237377166748047, |
|
"learning_rate": 3.958955223880597e-05, |
|
"loss": 0.2406, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 7.950930118560791, |
|
"learning_rate": 3.921641791044776e-05, |
|
"loss": 0.1884, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 4.41484260559082, |
|
"learning_rate": 3.8843283582089554e-05, |
|
"loss": 0.1445, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 6.339887619018555, |
|
"learning_rate": 3.8470149253731345e-05, |
|
"loss": 0.2906, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 7.597599983215332, |
|
"learning_rate": 3.809701492537314e-05, |
|
"loss": 0.1576, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"grad_norm": 2.379629373550415, |
|
"learning_rate": 3.772388059701493e-05, |
|
"loss": 0.2016, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 2.7694478034973145, |
|
"learning_rate": 3.735074626865671e-05, |
|
"loss": 0.1188, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 2.1837210655212402, |
|
"learning_rate": 3.6977611940298505e-05, |
|
"loss": 0.1908, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 4.4665350914001465, |
|
"learning_rate": 3.66044776119403e-05, |
|
"loss": 0.24, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"eval_accuracy": 0.8260105448154658, |
|
"eval_loss": 0.7510848641395569, |
|
"eval_runtime": 8.2044, |
|
"eval_samples_per_second": 69.353, |
|
"eval_steps_per_second": 8.776, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 4.6551995277404785, |
|
"learning_rate": 3.6231343283582095e-05, |
|
"loss": 0.1631, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 1.098407506942749, |
|
"learning_rate": 3.585820895522388e-05, |
|
"loss": 0.0912, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.37138649821281433, |
|
"learning_rate": 3.548507462686567e-05, |
|
"loss": 0.2621, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"grad_norm": 7.4571757316589355, |
|
"learning_rate": 3.511194029850746e-05, |
|
"loss": 0.268, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"grad_norm": 0.5180323123931885, |
|
"learning_rate": 3.4738805970149254e-05, |
|
"loss": 0.2135, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"grad_norm": 1.0866820812225342, |
|
"learning_rate": 3.4365671641791046e-05, |
|
"loss": 0.1489, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 8.90451717376709, |
|
"learning_rate": 3.399253731343284e-05, |
|
"loss": 0.288, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 1.1608803272247314, |
|
"learning_rate": 3.361940298507463e-05, |
|
"loss": 0.18, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 2.9207170009613037, |
|
"learning_rate": 3.3246268656716414e-05, |
|
"loss": 0.2414, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.2674783170223236, |
|
"learning_rate": 3.287313432835821e-05, |
|
"loss": 0.2359, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"eval_accuracy": 0.8137082601054482, |
|
"eval_loss": 0.7501537203788757, |
|
"eval_runtime": 8.1528, |
|
"eval_samples_per_second": 69.792, |
|
"eval_steps_per_second": 8.831, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 8.241676330566406, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.1975, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"grad_norm": 2.0347325801849365, |
|
"learning_rate": 3.2126865671641796e-05, |
|
"loss": 0.218, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 1.0338706970214844, |
|
"learning_rate": 3.175373134328358e-05, |
|
"loss": 0.1437, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"grad_norm": 0.34902578592300415, |
|
"learning_rate": 3.138059701492537e-05, |
|
"loss": 0.1883, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 6.642534255981445, |
|
"learning_rate": 3.100746268656717e-05, |
|
"loss": 0.2513, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 4.432920455932617, |
|
"learning_rate": 3.0634328358208955e-05, |
|
"loss": 0.1058, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"grad_norm": 4.381640434265137, |
|
"learning_rate": 3.0261194029850747e-05, |
|
"loss": 0.2114, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 7.730411529541016, |
|
"learning_rate": 2.9888059701492538e-05, |
|
"loss": 0.2542, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 7.122923851013184, |
|
"learning_rate": 2.9514925373134326e-05, |
|
"loss": 0.2594, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"grad_norm": 1.411278486251831, |
|
"learning_rate": 2.9141791044776125e-05, |
|
"loss": 0.2322, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"eval_accuracy": 0.8347978910369068, |
|
"eval_loss": 0.6952534317970276, |
|
"eval_runtime": 8.3769, |
|
"eval_samples_per_second": 67.925, |
|
"eval_steps_per_second": 8.595, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.13, |
|
"grad_norm": 2.219285011291504, |
|
"learning_rate": 2.8768656716417913e-05, |
|
"loss": 0.1344, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 6.302455902099609, |
|
"learning_rate": 2.8395522388059705e-05, |
|
"loss": 0.2098, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 1.2837783098220825, |
|
"learning_rate": 2.8022388059701493e-05, |
|
"loss": 0.0906, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 6.604355335235596, |
|
"learning_rate": 2.7649253731343284e-05, |
|
"loss": 0.2352, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 9.916419982910156, |
|
"learning_rate": 2.727611940298508e-05, |
|
"loss": 0.1422, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 2.7665014266967773, |
|
"learning_rate": 2.6902985074626868e-05, |
|
"loss": 0.1722, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"grad_norm": 0.24231348931789398, |
|
"learning_rate": 2.652985074626866e-05, |
|
"loss": 0.2935, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"grad_norm": 0.8025885224342346, |
|
"learning_rate": 2.6156716417910447e-05, |
|
"loss": 0.157, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"grad_norm": 1.6752264499664307, |
|
"learning_rate": 2.578358208955224e-05, |
|
"loss": 0.1256, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 2.404883861541748, |
|
"learning_rate": 2.5410447761194027e-05, |
|
"loss": 0.1514, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"eval_accuracy": 0.8260105448154658, |
|
"eval_loss": 0.7120960354804993, |
|
"eval_runtime": 8.1425, |
|
"eval_samples_per_second": 69.88, |
|
"eval_steps_per_second": 8.842, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 5.409728050231934, |
|
"learning_rate": 2.5037313432835825e-05, |
|
"loss": 0.222, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"grad_norm": 3.949014663696289, |
|
"learning_rate": 2.4664179104477614e-05, |
|
"loss": 0.245, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"grad_norm": 8.40086555480957, |
|
"learning_rate": 2.4291044776119405e-05, |
|
"loss": 0.1408, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"grad_norm": 7.694955348968506, |
|
"learning_rate": 2.3917910447761197e-05, |
|
"loss": 0.2072, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"grad_norm": 1.9109055995941162, |
|
"learning_rate": 2.3544776119402985e-05, |
|
"loss": 0.145, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 12.803776741027832, |
|
"learning_rate": 2.3171641791044777e-05, |
|
"loss": 0.1274, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 3.3325235843658447, |
|
"learning_rate": 2.2798507462686568e-05, |
|
"loss": 0.1564, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 1.105327844619751, |
|
"learning_rate": 2.242537313432836e-05, |
|
"loss": 0.2008, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 1.7592620849609375, |
|
"learning_rate": 2.2052238805970148e-05, |
|
"loss": 0.203, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 0.13264060020446777, |
|
"learning_rate": 2.1679104477611943e-05, |
|
"loss": 0.2089, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"eval_accuracy": 0.827768014059754, |
|
"eval_loss": 0.693087637424469, |
|
"eval_runtime": 8.2375, |
|
"eval_samples_per_second": 69.074, |
|
"eval_steps_per_second": 8.741, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"grad_norm": 5.904381275177002, |
|
"learning_rate": 2.130597014925373e-05, |
|
"loss": 0.1754, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 1.7469266653060913, |
|
"learning_rate": 2.0932835820895526e-05, |
|
"loss": 0.1322, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 7.95, |
|
"grad_norm": 4.313326835632324, |
|
"learning_rate": 2.0559701492537314e-05, |
|
"loss": 0.1418, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"grad_norm": 0.14211903512477875, |
|
"learning_rate": 2.0186567164179106e-05, |
|
"loss": 0.1534, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"grad_norm": 5.527184009552002, |
|
"learning_rate": 1.9813432835820897e-05, |
|
"loss": 0.2122, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 0.2312430739402771, |
|
"learning_rate": 1.9440298507462686e-05, |
|
"loss": 0.1617, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 0.23949085175991058, |
|
"learning_rate": 1.906716417910448e-05, |
|
"loss": 0.1286, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"grad_norm": 0.1903185099363327, |
|
"learning_rate": 1.869402985074627e-05, |
|
"loss": 0.0846, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.08518023788928986, |
|
"learning_rate": 1.832089552238806e-05, |
|
"loss": 0.0801, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"grad_norm": 4.424215793609619, |
|
"learning_rate": 1.7947761194029852e-05, |
|
"loss": 0.2245, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"eval_accuracy": 0.8330404217926186, |
|
"eval_loss": 0.7087014317512512, |
|
"eval_runtime": 8.1117, |
|
"eval_samples_per_second": 70.145, |
|
"eval_steps_per_second": 8.876, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 7.247931480407715, |
|
"learning_rate": 1.7574626865671644e-05, |
|
"loss": 0.0722, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 4.80264949798584, |
|
"learning_rate": 1.7201492537313435e-05, |
|
"loss": 0.0844, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 8.001790046691895, |
|
"learning_rate": 1.6828358208955223e-05, |
|
"loss": 0.1077, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 5.419641017913818, |
|
"learning_rate": 1.6455223880597015e-05, |
|
"loss": 0.1627, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.031686268746852875, |
|
"learning_rate": 1.6082089552238806e-05, |
|
"loss": 0.0984, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"grad_norm": 6.095193862915039, |
|
"learning_rate": 1.5708955223880598e-05, |
|
"loss": 0.1756, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 5.179446220397949, |
|
"learning_rate": 1.5335820895522386e-05, |
|
"loss": 0.1708, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"grad_norm": 4.06497049331665, |
|
"learning_rate": 1.496268656716418e-05, |
|
"loss": 0.1493, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 1.4721342325210571, |
|
"learning_rate": 1.458955223880597e-05, |
|
"loss": 0.2587, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 4.418783664703369, |
|
"learning_rate": 1.4216417910447763e-05, |
|
"loss": 0.1328, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"eval_accuracy": 0.8312829525483304, |
|
"eval_loss": 0.700339674949646, |
|
"eval_runtime": 8.481, |
|
"eval_samples_per_second": 67.091, |
|
"eval_steps_per_second": 8.49, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 1.5734038352966309, |
|
"learning_rate": 1.3843283582089553e-05, |
|
"loss": 0.165, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 2.624784231185913, |
|
"learning_rate": 1.3470149253731342e-05, |
|
"loss": 0.0837, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"grad_norm": 2.7039573192596436, |
|
"learning_rate": 1.3097014925373136e-05, |
|
"loss": 0.2098, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"grad_norm": 6.542816638946533, |
|
"learning_rate": 1.2723880597014926e-05, |
|
"loss": 0.129, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 2.9511120319366455, |
|
"learning_rate": 1.2350746268656717e-05, |
|
"loss": 0.1762, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 8.81, |
|
"grad_norm": 3.435502529144287, |
|
"learning_rate": 1.1977611940298509e-05, |
|
"loss": 0.1345, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 2.1689364910125732, |
|
"learning_rate": 1.1604477611940299e-05, |
|
"loss": 0.1011, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 2.3366479873657227, |
|
"learning_rate": 1.123134328358209e-05, |
|
"loss": 0.1733, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 5.928171634674072, |
|
"learning_rate": 1.085820895522388e-05, |
|
"loss": 0.1089, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 0.08636012673377991, |
|
"learning_rate": 1.0485074626865672e-05, |
|
"loss": 0.1304, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"eval_accuracy": 0.8224956063268892, |
|
"eval_loss": 0.7306046485900879, |
|
"eval_runtime": 8.4262, |
|
"eval_samples_per_second": 67.528, |
|
"eval_steps_per_second": 8.545, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 8.99, |
|
"grad_norm": 0.14256200194358826, |
|
"learning_rate": 1.0111940298507463e-05, |
|
"loss": 0.1506, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.4166848659515381, |
|
"learning_rate": 9.738805970149255e-06, |
|
"loss": 0.2058, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"grad_norm": 0.3997032344341278, |
|
"learning_rate": 9.365671641791045e-06, |
|
"loss": 0.0482, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 9.076058387756348, |
|
"learning_rate": 8.992537313432836e-06, |
|
"loss": 0.2201, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 4.368849277496338, |
|
"learning_rate": 8.619402985074628e-06, |
|
"loss": 0.1288, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"grad_norm": 4.311466693878174, |
|
"learning_rate": 8.24626865671642e-06, |
|
"loss": 0.3058, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"grad_norm": 0.2911408543586731, |
|
"learning_rate": 7.87313432835821e-06, |
|
"loss": 0.1303, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 5.493233680725098, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.0915, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 9.29, |
|
"grad_norm": 0.09431172162294388, |
|
"learning_rate": 7.126865671641792e-06, |
|
"loss": 0.0954, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"grad_norm": 1.8603869676589966, |
|
"learning_rate": 6.7537313432835825e-06, |
|
"loss": 0.1514, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"eval_accuracy": 0.8260105448154658, |
|
"eval_loss": 0.7162156701087952, |
|
"eval_runtime": 8.3201, |
|
"eval_samples_per_second": 68.389, |
|
"eval_steps_per_second": 8.654, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 4.870584964752197, |
|
"learning_rate": 6.380597014925374e-06, |
|
"loss": 0.1354, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 2.316840410232544, |
|
"learning_rate": 6.007462686567165e-06, |
|
"loss": 0.1348, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"grad_norm": 1.9005101919174194, |
|
"learning_rate": 5.6343283582089556e-06, |
|
"loss": 0.1755, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 0.1674620360136032, |
|
"learning_rate": 5.261194029850746e-06, |
|
"loss": 0.0878, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 9.51, |
|
"grad_norm": 5.729959011077881, |
|
"learning_rate": 4.888059701492537e-06, |
|
"loss": 0.1637, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 0.02724504843354225, |
|
"learning_rate": 4.514925373134329e-06, |
|
"loss": 0.1603, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"grad_norm": 2.728663921356201, |
|
"learning_rate": 4.141791044776119e-06, |
|
"loss": 0.1152, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 8.920695304870605, |
|
"learning_rate": 3.7686567164179105e-06, |
|
"loss": 0.1964, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"grad_norm": 2.3974239826202393, |
|
"learning_rate": 3.3955223880597013e-06, |
|
"loss": 0.0842, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"grad_norm": 1.6431355476379395, |
|
"learning_rate": 3.022388059701493e-06, |
|
"loss": 0.2571, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"eval_accuracy": 0.8347978910369068, |
|
"eval_loss": 0.7012546062469482, |
|
"eval_runtime": 8.3265, |
|
"eval_samples_per_second": 68.336, |
|
"eval_steps_per_second": 8.647, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 0.10621854662895203, |
|
"learning_rate": 2.6492537313432836e-06, |
|
"loss": 0.2632, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 4.150152206420898, |
|
"learning_rate": 2.2761194029850747e-06, |
|
"loss": 0.2804, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"grad_norm": 4.01139497756958, |
|
"learning_rate": 1.9029850746268657e-06, |
|
"loss": 0.1696, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"grad_norm": 4.7402262687683105, |
|
"learning_rate": 1.5298507462686568e-06, |
|
"loss": 0.1891, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"grad_norm": 4.460111141204834, |
|
"learning_rate": 1.1567164179104478e-06, |
|
"loss": 0.1178, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 9.93, |
|
"grad_norm": 5.822507858276367, |
|
"learning_rate": 7.835820895522387e-07, |
|
"loss": 0.089, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"grad_norm": 2.4408085346221924, |
|
"learning_rate": 4.1044776119402984e-07, |
|
"loss": 0.158, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 10.792135238647461, |
|
"learning_rate": 3.7313432835820895e-08, |
|
"loss": 0.2038, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 2680, |
|
"total_flos": 3.3230947683690086e+18, |
|
"train_loss": 0.23535207314277762, |
|
"train_runtime": 1371.8304, |
|
"train_samples_per_second": 31.258, |
|
"train_steps_per_second": 1.954 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2680, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 3.3230947683690086e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|