{ "best_metric": 0.86, "best_model_checkpoint": "./checkpoints/checkpoint_model_lr_5e_05_bs_16/checkpoint-1144", "epoch": 26.0, "eval_steps": 500, "global_step": 1144, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11363636363636363, "grad_norm": 15.224111557006836, "learning_rate": 1.1363636363636364e-06, "loss": 2.5973, "step": 5 }, { "epoch": 0.22727272727272727, "grad_norm": 14.61684513092041, "learning_rate": 2.0454545454545457e-06, "loss": 2.5056, "step": 10 }, { "epoch": 0.3409090909090909, "grad_norm": 13.760466575622559, "learning_rate": 3.1818181818181817e-06, "loss": 2.4719, "step": 15 }, { "epoch": 0.45454545454545453, "grad_norm": 15.71677017211914, "learning_rate": 4.3181818181818185e-06, "loss": 2.3512, "step": 20 }, { "epoch": 0.5681818181818182, "grad_norm": 12.284369468688965, "learning_rate": 5.4545454545454545e-06, "loss": 2.2199, "step": 25 }, { "epoch": 0.6818181818181818, "grad_norm": 11.864232063293457, "learning_rate": 6.59090909090909e-06, "loss": 2.0262, "step": 30 }, { "epoch": 0.7954545454545454, "grad_norm": 11.269423484802246, "learning_rate": 7.727272727272727e-06, "loss": 1.7453, "step": 35 }, { "epoch": 0.9090909090909091, "grad_norm": 13.125885963439941, "learning_rate": 8.863636363636365e-06, "loss": 1.541, "step": 40 }, { "epoch": 1.0, "eval_accuracy": 0.67, "eval_loss": 1.2503589391708374, "eval_runtime": 18.1476, "eval_samples_per_second": 11.021, "eval_steps_per_second": 0.716, "step": 44 }, { "epoch": 1.0227272727272727, "grad_norm": 11.554954528808594, "learning_rate": 1e-05, "loss": 1.2945, "step": 45 }, { "epoch": 1.1363636363636362, "grad_norm": 11.968127250671387, "learning_rate": 1.1136363636363637e-05, "loss": 1.0641, "step": 50 }, { "epoch": 1.25, "grad_norm": 11.197978019714355, "learning_rate": 1.2272727272727273e-05, "loss": 0.846, "step": 55 }, { "epoch": 1.3636363636363638, "grad_norm": 9.72460651397705, "learning_rate": 1.340909090909091e-05, "loss": 0.6572, "step": 60 }, { "epoch": 1.4772727272727273, "grad_norm": 14.739611625671387, "learning_rate": 1.4545454545454545e-05, "loss": 0.6622, "step": 65 }, { "epoch": 1.5909090909090908, "grad_norm": 13.50337028503418, "learning_rate": 1.5681818181818182e-05, "loss": 0.565, "step": 70 }, { "epoch": 1.7045454545454546, "grad_norm": 9.195622444152832, "learning_rate": 1.6818181818181818e-05, "loss": 0.5775, "step": 75 }, { "epoch": 1.8181818181818183, "grad_norm": 10.333452224731445, "learning_rate": 1.7954545454545454e-05, "loss": 0.5743, "step": 80 }, { "epoch": 1.9318181818181817, "grad_norm": 10.101814270019531, "learning_rate": 1.9090909090909094e-05, "loss": 0.5395, "step": 85 }, { "epoch": 2.0, "eval_accuracy": 0.75, "eval_loss": 0.715374231338501, "eval_runtime": 18.1709, "eval_samples_per_second": 11.007, "eval_steps_per_second": 0.715, "step": 88 }, { "epoch": 2.0454545454545454, "grad_norm": 8.385086059570312, "learning_rate": 2.022727272727273e-05, "loss": 0.5222, "step": 90 }, { "epoch": 2.159090909090909, "grad_norm": 4.65899133682251, "learning_rate": 2.1363636363636362e-05, "loss": 0.4306, "step": 95 }, { "epoch": 2.2727272727272725, "grad_norm": 10.004486083984375, "learning_rate": 2.25e-05, "loss": 0.2421, "step": 100 }, { "epoch": 2.3863636363636362, "grad_norm": 9.067935943603516, "learning_rate": 2.3636363636363637e-05, "loss": 0.2421, "step": 105 }, { "epoch": 2.5, "grad_norm": 6.462744235992432, "learning_rate": 2.4772727272727277e-05, "loss": 0.2362, "step": 110 }, { "epoch": 2.6136363636363638, "grad_norm": 22.449291229248047, "learning_rate": 2.590909090909091e-05, "loss": 0.3596, "step": 115 }, { "epoch": 2.7272727272727275, "grad_norm": 10.209068298339844, "learning_rate": 2.7045454545454545e-05, "loss": 0.3211, "step": 120 }, { "epoch": 2.840909090909091, "grad_norm": 12.491495132446289, "learning_rate": 2.818181818181818e-05, "loss": 0.3784, "step": 125 }, { "epoch": 2.9545454545454546, "grad_norm": 14.337518692016602, "learning_rate": 2.9318181818181817e-05, "loss": 0.3125, "step": 130 }, { "epoch": 3.0, "eval_accuracy": 0.705, "eval_loss": 0.8171606659889221, "eval_runtime": 17.9915, "eval_samples_per_second": 11.116, "eval_steps_per_second": 0.723, "step": 132 }, { "epoch": 3.0681818181818183, "grad_norm": 9.655952453613281, "learning_rate": 3.0454545454545456e-05, "loss": 0.4244, "step": 135 }, { "epoch": 3.1818181818181817, "grad_norm": 4.139129161834717, "learning_rate": 3.159090909090909e-05, "loss": 0.1704, "step": 140 }, { "epoch": 3.2954545454545454, "grad_norm": 9.019370079040527, "learning_rate": 3.272727272727273e-05, "loss": 0.2044, "step": 145 }, { "epoch": 3.409090909090909, "grad_norm": 1.8330605030059814, "learning_rate": 3.3863636363636364e-05, "loss": 0.0844, "step": 150 }, { "epoch": 3.5227272727272725, "grad_norm": 1.0041872262954712, "learning_rate": 3.5e-05, "loss": 0.1684, "step": 155 }, { "epoch": 3.6363636363636362, "grad_norm": 13.836721420288086, "learning_rate": 3.613636363636364e-05, "loss": 0.1244, "step": 160 }, { "epoch": 3.75, "grad_norm": 10.699409484863281, "learning_rate": 3.7272727272727276e-05, "loss": 0.1537, "step": 165 }, { "epoch": 3.8636363636363638, "grad_norm": 1.7060661315917969, "learning_rate": 3.840909090909091e-05, "loss": 0.1152, "step": 170 }, { "epoch": 3.9772727272727275, "grad_norm": 7.366329669952393, "learning_rate": 3.954545454545455e-05, "loss": 0.1762, "step": 175 }, { "epoch": 4.0, "eval_accuracy": 0.765, "eval_loss": 0.7440270781517029, "eval_runtime": 18.4547, "eval_samples_per_second": 10.837, "eval_steps_per_second": 0.704, "step": 176 }, { "epoch": 4.090909090909091, "grad_norm": 5.621038436889648, "learning_rate": 4.068181818181818e-05, "loss": 0.0961, "step": 180 }, { "epoch": 4.204545454545454, "grad_norm": 2.5091235637664795, "learning_rate": 4.181818181818182e-05, "loss": 0.195, "step": 185 }, { "epoch": 4.318181818181818, "grad_norm": 11.424132347106934, "learning_rate": 4.295454545454546e-05, "loss": 0.0859, "step": 190 }, { "epoch": 4.431818181818182, "grad_norm": 2.8084189891815186, "learning_rate": 4.409090909090909e-05, "loss": 0.0252, "step": 195 }, { "epoch": 4.545454545454545, "grad_norm": 2.906534194946289, "learning_rate": 4.522727272727273e-05, "loss": 0.062, "step": 200 }, { "epoch": 4.659090909090909, "grad_norm": 7.522055149078369, "learning_rate": 4.636363636363636e-05, "loss": 0.0973, "step": 205 }, { "epoch": 4.7727272727272725, "grad_norm": 7.575300693511963, "learning_rate": 4.75e-05, "loss": 0.0665, "step": 210 }, { "epoch": 4.886363636363637, "grad_norm": 25.463464736938477, "learning_rate": 4.863636363636364e-05, "loss": 0.1488, "step": 215 }, { "epoch": 5.0, "grad_norm": 16.796293258666992, "learning_rate": 4.9772727272727275e-05, "loss": 0.4321, "step": 220 }, { "epoch": 5.0, "eval_accuracy": 0.73, "eval_loss": 1.1859827041625977, "eval_runtime": 18.064, "eval_samples_per_second": 11.072, "eval_steps_per_second": 0.72, "step": 220 }, { "epoch": 5.113636363636363, "grad_norm": 0.248295396566391, "learning_rate": 4.98989898989899e-05, "loss": 0.2525, "step": 225 }, { "epoch": 5.2272727272727275, "grad_norm": 1.8920602798461914, "learning_rate": 4.9772727272727275e-05, "loss": 0.0468, "step": 230 }, { "epoch": 5.340909090909091, "grad_norm": 10.923778533935547, "learning_rate": 4.964646464646465e-05, "loss": 0.0465, "step": 235 }, { "epoch": 5.454545454545454, "grad_norm": 2.1219613552093506, "learning_rate": 4.952020202020202e-05, "loss": 0.1053, "step": 240 }, { "epoch": 5.568181818181818, "grad_norm": 3.359290361404419, "learning_rate": 4.93939393939394e-05, "loss": 0.0528, "step": 245 }, { "epoch": 5.681818181818182, "grad_norm": 11.049290657043457, "learning_rate": 4.9267676767676765e-05, "loss": 0.1639, "step": 250 }, { "epoch": 5.795454545454545, "grad_norm": 14.748278617858887, "learning_rate": 4.9141414141414145e-05, "loss": 0.1953, "step": 255 }, { "epoch": 5.909090909090909, "grad_norm": 1.4790602922439575, "learning_rate": 4.901515151515152e-05, "loss": 0.1226, "step": 260 }, { "epoch": 6.0, "eval_accuracy": 0.76, "eval_loss": 1.0318822860717773, "eval_runtime": 17.924, "eval_samples_per_second": 11.158, "eval_steps_per_second": 0.725, "step": 264 }, { "epoch": 6.0227272727272725, "grad_norm": 22.95401954650879, "learning_rate": 4.888888888888889e-05, "loss": 0.1116, "step": 265 }, { "epoch": 6.136363636363637, "grad_norm": 18.67777442932129, "learning_rate": 4.876262626262626e-05, "loss": 0.1461, "step": 270 }, { "epoch": 6.25, "grad_norm": 1.6643537282943726, "learning_rate": 4.863636363636364e-05, "loss": 0.1635, "step": 275 }, { "epoch": 6.363636363636363, "grad_norm": 1.0799901485443115, "learning_rate": 4.851010101010101e-05, "loss": 0.0535, "step": 280 }, { "epoch": 6.4772727272727275, "grad_norm": 3.293269395828247, "learning_rate": 4.838383838383839e-05, "loss": 0.1025, "step": 285 }, { "epoch": 6.590909090909091, "grad_norm": 1.957770824432373, "learning_rate": 4.825757575757576e-05, "loss": 0.0845, "step": 290 }, { "epoch": 6.704545454545455, "grad_norm": 1.3866221904754639, "learning_rate": 4.813131313131313e-05, "loss": 0.0431, "step": 295 }, { "epoch": 6.818181818181818, "grad_norm": 0.7969300746917725, "learning_rate": 4.8005050505050505e-05, "loss": 0.2299, "step": 300 }, { "epoch": 6.931818181818182, "grad_norm": 0.36333414912223816, "learning_rate": 4.787878787878788e-05, "loss": 0.0158, "step": 305 }, { "epoch": 7.0, "eval_accuracy": 0.795, "eval_loss": 0.7636151313781738, "eval_runtime": 18.1881, "eval_samples_per_second": 10.996, "eval_steps_per_second": 0.715, "step": 308 }, { "epoch": 7.045454545454546, "grad_norm": 0.7603231072425842, "learning_rate": 4.775252525252526e-05, "loss": 0.0222, "step": 310 }, { "epoch": 7.159090909090909, "grad_norm": 0.044530898332595825, "learning_rate": 4.762626262626263e-05, "loss": 0.0114, "step": 315 }, { "epoch": 7.2727272727272725, "grad_norm": 13.010513305664062, "learning_rate": 4.75e-05, "loss": 0.0344, "step": 320 }, { "epoch": 7.386363636363637, "grad_norm": 1.4342093467712402, "learning_rate": 4.7373737373737375e-05, "loss": 0.0254, "step": 325 }, { "epoch": 7.5, "grad_norm": 9.25933837890625, "learning_rate": 4.7247474747474755e-05, "loss": 0.0171, "step": 330 }, { "epoch": 7.613636363636363, "grad_norm": 0.09031987190246582, "learning_rate": 4.712121212121212e-05, "loss": 0.0023, "step": 335 }, { "epoch": 7.7272727272727275, "grad_norm": 0.03679551184177399, "learning_rate": 4.69949494949495e-05, "loss": 0.0032, "step": 340 }, { "epoch": 7.840909090909091, "grad_norm": 2.662745237350464, "learning_rate": 4.686868686868687e-05, "loss": 0.0048, "step": 345 }, { "epoch": 7.954545454545455, "grad_norm": 0.17469650506973267, "learning_rate": 4.6742424242424245e-05, "loss": 0.0009, "step": 350 }, { "epoch": 8.0, "eval_accuracy": 0.84, "eval_loss": 0.8234661817550659, "eval_runtime": 18.0311, "eval_samples_per_second": 11.092, "eval_steps_per_second": 0.721, "step": 352 }, { "epoch": 8.068181818181818, "grad_norm": 0.034661825746297836, "learning_rate": 4.661616161616162e-05, "loss": 0.014, "step": 355 }, { "epoch": 8.181818181818182, "grad_norm": 0.06576091796159744, "learning_rate": 4.6489898989899e-05, "loss": 0.0007, "step": 360 }, { "epoch": 8.295454545454545, "grad_norm": 0.016597317531704903, "learning_rate": 4.636363636363636e-05, "loss": 0.002, "step": 365 }, { "epoch": 8.409090909090908, "grad_norm": 0.013650023378431797, "learning_rate": 4.623737373737374e-05, "loss": 0.0015, "step": 370 }, { "epoch": 8.522727272727273, "grad_norm": 0.19399936497211456, "learning_rate": 4.6111111111111115e-05, "loss": 0.0069, "step": 375 }, { "epoch": 8.636363636363637, "grad_norm": 0.05494086816906929, "learning_rate": 4.598484848484849e-05, "loss": 0.0026, "step": 380 }, { "epoch": 8.75, "grad_norm": 0.013664503581821918, "learning_rate": 4.585858585858586e-05, "loss": 0.0012, "step": 385 }, { "epoch": 8.863636363636363, "grad_norm": 0.025454234331846237, "learning_rate": 4.573232323232323e-05, "loss": 0.0006, "step": 390 }, { "epoch": 8.977272727272727, "grad_norm": 2.3740947246551514, "learning_rate": 4.5606060606060606e-05, "loss": 0.0029, "step": 395 }, { "epoch": 9.0, "eval_accuracy": 0.83, "eval_loss": 0.892738401889801, "eval_runtime": 18.123, "eval_samples_per_second": 11.036, "eval_steps_per_second": 0.717, "step": 396 }, { "epoch": 9.090909090909092, "grad_norm": 0.20694267749786377, "learning_rate": 4.5479797979797985e-05, "loss": 0.001, "step": 400 }, { "epoch": 9.204545454545455, "grad_norm": 0.07484059035778046, "learning_rate": 4.535353535353535e-05, "loss": 0.0027, "step": 405 }, { "epoch": 9.318181818181818, "grad_norm": 0.1402018964290619, "learning_rate": 4.522727272727273e-05, "loss": 0.1035, "step": 410 }, { "epoch": 9.431818181818182, "grad_norm": 0.011067689396440983, "learning_rate": 4.51010101010101e-05, "loss": 0.0145, "step": 415 }, { "epoch": 9.545454545454545, "grad_norm": 6.280289649963379, "learning_rate": 4.4974747474747476e-05, "loss": 0.0097, "step": 420 }, { "epoch": 9.659090909090908, "grad_norm": 0.018114736303687096, "learning_rate": 4.484848484848485e-05, "loss": 0.0006, "step": 425 }, { "epoch": 9.772727272727273, "grad_norm": 11.336344718933105, "learning_rate": 4.472222222222223e-05, "loss": 0.0134, "step": 430 }, { "epoch": 9.886363636363637, "grad_norm": 0.30804646015167236, "learning_rate": 4.4595959595959594e-05, "loss": 0.1245, "step": 435 }, { "epoch": 10.0, "grad_norm": 6.994472503662109, "learning_rate": 4.4469696969696973e-05, "loss": 0.0522, "step": 440 }, { "epoch": 10.0, "eval_accuracy": 0.775, "eval_loss": 1.2321974039077759, "eval_runtime": 18.1204, "eval_samples_per_second": 11.037, "eval_steps_per_second": 0.717, "step": 440 }, { "epoch": 10.113636363636363, "grad_norm": 2.2376856803894043, "learning_rate": 4.4343434343434346e-05, "loss": 0.0934, "step": 445 }, { "epoch": 10.227272727272727, "grad_norm": 0.008311591111123562, "learning_rate": 4.421717171717172e-05, "loss": 0.0298, "step": 450 }, { "epoch": 10.340909090909092, "grad_norm": 21.288654327392578, "learning_rate": 4.409090909090909e-05, "loss": 0.1688, "step": 455 }, { "epoch": 10.454545454545455, "grad_norm": 16.286630630493164, "learning_rate": 4.396464646464647e-05, "loss": 0.09, "step": 460 }, { "epoch": 10.568181818181818, "grad_norm": 0.036682695150375366, "learning_rate": 4.383838383838384e-05, "loss": 0.0326, "step": 465 }, { "epoch": 10.681818181818182, "grad_norm": 0.0594622865319252, "learning_rate": 4.3712121212121216e-05, "loss": 0.0026, "step": 470 }, { "epoch": 10.795454545454545, "grad_norm": 11.812299728393555, "learning_rate": 4.358585858585859e-05, "loss": 0.0363, "step": 475 }, { "epoch": 10.909090909090908, "grad_norm": 3.5540413856506348, "learning_rate": 4.345959595959596e-05, "loss": 0.0032, "step": 480 }, { "epoch": 11.0, "eval_accuracy": 0.765, "eval_loss": 1.2396481037139893, "eval_runtime": 18.1304, "eval_samples_per_second": 11.031, "eval_steps_per_second": 0.717, "step": 484 }, { "epoch": 11.022727272727273, "grad_norm": 0.7457100749015808, "learning_rate": 4.3333333333333334e-05, "loss": 0.0114, "step": 485 }, { "epoch": 11.136363636363637, "grad_norm": 1.6869572401046753, "learning_rate": 4.320707070707071e-05, "loss": 0.0147, "step": 490 }, { "epoch": 11.25, "grad_norm": 0.05314645916223526, "learning_rate": 4.308080808080808e-05, "loss": 0.0137, "step": 495 }, { "epoch": 11.363636363636363, "grad_norm": 0.05840263515710831, "learning_rate": 4.295454545454546e-05, "loss": 0.1057, "step": 500 }, { "epoch": 11.477272727272727, "grad_norm": 17.320690155029297, "learning_rate": 4.282828282828283e-05, "loss": 0.0669, "step": 505 }, { "epoch": 11.590909090909092, "grad_norm": 21.867401123046875, "learning_rate": 4.2702020202020204e-05, "loss": 0.0451, "step": 510 }, { "epoch": 11.704545454545455, "grad_norm": 0.07670444995164871, "learning_rate": 4.257575757575758e-05, "loss": 0.0029, "step": 515 }, { "epoch": 11.818181818181818, "grad_norm": 0.08975853025913239, "learning_rate": 4.244949494949495e-05, "loss": 0.0045, "step": 520 }, { "epoch": 11.931818181818182, "grad_norm": 0.46701082587242126, "learning_rate": 4.232323232323233e-05, "loss": 0.0038, "step": 525 }, { "epoch": 12.0, "eval_accuracy": 0.795, "eval_loss": 0.9945923686027527, "eval_runtime": 18.2396, "eval_samples_per_second": 10.965, "eval_steps_per_second": 0.713, "step": 528 }, { "epoch": 12.045454545454545, "grad_norm": 0.011377710849046707, "learning_rate": 4.21969696969697e-05, "loss": 0.0256, "step": 530 }, { "epoch": 12.159090909090908, "grad_norm": 0.018205437809228897, "learning_rate": 4.2070707070707074e-05, "loss": 0.005, "step": 535 }, { "epoch": 12.272727272727273, "grad_norm": 0.025772707536816597, "learning_rate": 4.194444444444445e-05, "loss": 0.0004, "step": 540 }, { "epoch": 12.386363636363637, "grad_norm": 1.5446034669876099, "learning_rate": 4.181818181818182e-05, "loss": 0.0429, "step": 545 }, { "epoch": 12.5, "grad_norm": 0.21388648450374603, "learning_rate": 4.169191919191919e-05, "loss": 0.0051, "step": 550 }, { "epoch": 12.613636363636363, "grad_norm": 6.640182018280029, "learning_rate": 4.156565656565657e-05, "loss": 0.0343, "step": 555 }, { "epoch": 12.727272727272727, "grad_norm": 0.2581124007701874, "learning_rate": 4.143939393939394e-05, "loss": 0.0048, "step": 560 }, { "epoch": 12.840909090909092, "grad_norm": 0.007842369377613068, "learning_rate": 4.131313131313132e-05, "loss": 0.0206, "step": 565 }, { "epoch": 12.954545454545455, "grad_norm": 18.075056076049805, "learning_rate": 4.118686868686869e-05, "loss": 0.1007, "step": 570 }, { "epoch": 13.0, "eval_accuracy": 0.76, "eval_loss": 1.2344247102737427, "eval_runtime": 18.5982, "eval_samples_per_second": 10.754, "eval_steps_per_second": 0.699, "step": 572 }, { "epoch": 13.068181818181818, "grad_norm": 0.03319934383034706, "learning_rate": 4.106060606060606e-05, "loss": 0.0243, "step": 575 }, { "epoch": 13.181818181818182, "grad_norm": 11.877718925476074, "learning_rate": 4.0934343434343435e-05, "loss": 0.0666, "step": 580 }, { "epoch": 13.295454545454545, "grad_norm": 0.40111812949180603, "learning_rate": 4.0808080808080814e-05, "loss": 0.0074, "step": 585 }, { "epoch": 13.409090909090908, "grad_norm": 0.5372756719589233, "learning_rate": 4.068181818181818e-05, "loss": 0.0101, "step": 590 }, { "epoch": 13.522727272727273, "grad_norm": 12.776512145996094, "learning_rate": 4.055555555555556e-05, "loss": 0.0137, "step": 595 }, { "epoch": 13.636363636363637, "grad_norm": 1.489402174949646, "learning_rate": 4.042929292929293e-05, "loss": 0.0023, "step": 600 }, { "epoch": 13.75, "grad_norm": 0.033923253417015076, "learning_rate": 4.0303030303030305e-05, "loss": 0.0375, "step": 605 }, { "epoch": 13.863636363636363, "grad_norm": 0.015429407358169556, "learning_rate": 4.017676767676768e-05, "loss": 0.0129, "step": 610 }, { "epoch": 13.977272727272727, "grad_norm": 0.010183623060584068, "learning_rate": 4.005050505050506e-05, "loss": 0.0003, "step": 615 }, { "epoch": 14.0, "eval_accuracy": 0.785, "eval_loss": 1.003933072090149, "eval_runtime": 18.085, "eval_samples_per_second": 11.059, "eval_steps_per_second": 0.719, "step": 616 }, { "epoch": 14.090909090909092, "grad_norm": 0.1973772794008255, "learning_rate": 3.992424242424242e-05, "loss": 0.0004, "step": 620 }, { "epoch": 14.204545454545455, "grad_norm": 0.012279081158339977, "learning_rate": 3.97979797979798e-05, "loss": 0.0015, "step": 625 }, { "epoch": 14.318181818181818, "grad_norm": 0.02780894935131073, "learning_rate": 3.967171717171717e-05, "loss": 0.0009, "step": 630 }, { "epoch": 14.431818181818182, "grad_norm": 0.004000269342213869, "learning_rate": 3.954545454545455e-05, "loss": 0.0016, "step": 635 }, { "epoch": 14.545454545454545, "grad_norm": 0.01629430614411831, "learning_rate": 3.941919191919192e-05, "loss": 0.0072, "step": 640 }, { "epoch": 14.659090909090908, "grad_norm": 0.023849163204431534, "learning_rate": 3.929292929292929e-05, "loss": 0.0007, "step": 645 }, { "epoch": 14.772727272727273, "grad_norm": 0.10356610268354416, "learning_rate": 3.9166666666666665e-05, "loss": 0.0026, "step": 650 }, { "epoch": 14.886363636363637, "grad_norm": 0.14715874195098877, "learning_rate": 3.9040404040404045e-05, "loss": 0.0003, "step": 655 }, { "epoch": 15.0, "grad_norm": 0.10584244877099991, "learning_rate": 3.891414141414141e-05, "loss": 0.0013, "step": 660 }, { "epoch": 15.0, "eval_accuracy": 0.82, "eval_loss": 1.136525273323059, "eval_runtime": 18.0084, "eval_samples_per_second": 11.106, "eval_steps_per_second": 0.722, "step": 660 }, { "epoch": 15.113636363636363, "grad_norm": 0.03254534304141998, "learning_rate": 3.878787878787879e-05, "loss": 0.0008, "step": 665 }, { "epoch": 15.227272727272727, "grad_norm": 0.024025389924645424, "learning_rate": 3.866161616161616e-05, "loss": 0.006, "step": 670 }, { "epoch": 15.340909090909092, "grad_norm": 0.0632549524307251, "learning_rate": 3.8535353535353536e-05, "loss": 0.0003, "step": 675 }, { "epoch": 15.454545454545455, "grad_norm": 0.1464257687330246, "learning_rate": 3.840909090909091e-05, "loss": 0.0046, "step": 680 }, { "epoch": 15.568181818181818, "grad_norm": 0.11179546266794205, "learning_rate": 3.828282828282829e-05, "loss": 0.0004, "step": 685 }, { "epoch": 15.681818181818182, "grad_norm": 4.28083610534668, "learning_rate": 3.815656565656566e-05, "loss": 0.0582, "step": 690 }, { "epoch": 15.795454545454545, "grad_norm": 0.02816050685942173, "learning_rate": 3.803030303030303e-05, "loss": 0.0002, "step": 695 }, { "epoch": 15.909090909090908, "grad_norm": 0.01497215311974287, "learning_rate": 3.7904040404040406e-05, "loss": 0.0006, "step": 700 }, { "epoch": 16.0, "eval_accuracy": 0.785, "eval_loss": 1.4599967002868652, "eval_runtime": 18.0022, "eval_samples_per_second": 11.11, "eval_steps_per_second": 0.722, "step": 704 }, { "epoch": 16.022727272727273, "grad_norm": 0.11498994380235672, "learning_rate": 3.777777777777778e-05, "loss": 0.0004, "step": 705 }, { "epoch": 16.136363636363637, "grad_norm": 0.07350458204746246, "learning_rate": 3.765151515151516e-05, "loss": 0.0003, "step": 710 }, { "epoch": 16.25, "grad_norm": 0.0034702487755566835, "learning_rate": 3.7525252525252524e-05, "loss": 0.0002, "step": 715 }, { "epoch": 16.363636363636363, "grad_norm": 0.02884703502058983, "learning_rate": 3.73989898989899e-05, "loss": 0.0006, "step": 720 }, { "epoch": 16.477272727272727, "grad_norm": 0.0020916229113936424, "learning_rate": 3.7272727272727276e-05, "loss": 0.0004, "step": 725 }, { "epoch": 16.59090909090909, "grad_norm": 0.056545890867710114, "learning_rate": 3.714646464646465e-05, "loss": 0.0005, "step": 730 }, { "epoch": 16.704545454545453, "grad_norm": 0.008786072954535484, "learning_rate": 3.702020202020202e-05, "loss": 0.0001, "step": 735 }, { "epoch": 16.818181818181817, "grad_norm": 0.002408586209639907, "learning_rate": 3.68939393939394e-05, "loss": 0.0002, "step": 740 }, { "epoch": 16.931818181818183, "grad_norm": 0.002747893799096346, "learning_rate": 3.6767676767676766e-05, "loss": 0.0001, "step": 745 }, { "epoch": 17.0, "eval_accuracy": 0.82, "eval_loss": 0.9243611693382263, "eval_runtime": 17.9514, "eval_samples_per_second": 11.141, "eval_steps_per_second": 0.724, "step": 748 }, { "epoch": 17.045454545454547, "grad_norm": 0.002468683058395982, "learning_rate": 3.6641414141414146e-05, "loss": 0.0002, "step": 750 }, { "epoch": 17.15909090909091, "grad_norm": 0.0032050844747573137, "learning_rate": 3.651515151515152e-05, "loss": 0.0001, "step": 755 }, { "epoch": 17.272727272727273, "grad_norm": 0.0034459943417459726, "learning_rate": 3.638888888888889e-05, "loss": 0.0001, "step": 760 }, { "epoch": 17.386363636363637, "grad_norm": 0.002899475395679474, "learning_rate": 3.6262626262626264e-05, "loss": 0.0001, "step": 765 }, { "epoch": 17.5, "grad_norm": 0.0023372883442789316, "learning_rate": 3.613636363636364e-05, "loss": 0.0001, "step": 770 }, { "epoch": 17.613636363636363, "grad_norm": 0.0017898937221616507, "learning_rate": 3.601010101010101e-05, "loss": 0.0001, "step": 775 }, { "epoch": 17.727272727272727, "grad_norm": 0.004293715115636587, "learning_rate": 3.588383838383839e-05, "loss": 0.0001, "step": 780 }, { "epoch": 17.84090909090909, "grad_norm": 0.0015544987982138991, "learning_rate": 3.575757575757576e-05, "loss": 0.0001, "step": 785 }, { "epoch": 17.954545454545453, "grad_norm": 0.0016952146543189883, "learning_rate": 3.5631313131313134e-05, "loss": 0.0001, "step": 790 }, { "epoch": 18.0, "eval_accuracy": 0.83, "eval_loss": 0.8526318073272705, "eval_runtime": 18.3475, "eval_samples_per_second": 10.901, "eval_steps_per_second": 0.709, "step": 792 }, { "epoch": 18.068181818181817, "grad_norm": 0.0020155711099505424, "learning_rate": 3.5505050505050506e-05, "loss": 0.0001, "step": 795 }, { "epoch": 18.181818181818183, "grad_norm": 0.0013277644757181406, "learning_rate": 3.537878787878788e-05, "loss": 0.0001, "step": 800 }, { "epoch": 18.295454545454547, "grad_norm": 0.0020753301214426756, "learning_rate": 3.525252525252525e-05, "loss": 0.0001, "step": 805 }, { "epoch": 18.40909090909091, "grad_norm": 0.0018062540329992771, "learning_rate": 3.512626262626263e-05, "loss": 0.0001, "step": 810 }, { "epoch": 18.522727272727273, "grad_norm": 0.0017586707836017013, "learning_rate": 3.5e-05, "loss": 0.0001, "step": 815 }, { "epoch": 18.636363636363637, "grad_norm": 0.001384661765769124, "learning_rate": 3.4873737373737376e-05, "loss": 0.0001, "step": 820 }, { "epoch": 18.75, "grad_norm": 0.001989172538742423, "learning_rate": 3.474747474747475e-05, "loss": 0.0001, "step": 825 }, { "epoch": 18.863636363636363, "grad_norm": 0.001609248691238463, "learning_rate": 3.462121212121212e-05, "loss": 0.0001, "step": 830 }, { "epoch": 18.977272727272727, "grad_norm": 0.001661130809225142, "learning_rate": 3.4494949494949494e-05, "loss": 0.0001, "step": 835 }, { "epoch": 19.0, "eval_accuracy": 0.83, "eval_loss": 0.8473049998283386, "eval_runtime": 18.1222, "eval_samples_per_second": 11.036, "eval_steps_per_second": 0.717, "step": 836 }, { "epoch": 19.09090909090909, "grad_norm": 0.0013667866587638855, "learning_rate": 3.4368686868686874e-05, "loss": 0.0001, "step": 840 }, { "epoch": 19.204545454545453, "grad_norm": 0.001460373867303133, "learning_rate": 3.424242424242424e-05, "loss": 0.0001, "step": 845 }, { "epoch": 19.318181818181817, "grad_norm": 0.001786314183846116, "learning_rate": 3.411616161616162e-05, "loss": 0.0001, "step": 850 }, { "epoch": 19.431818181818183, "grad_norm": 0.0021370083559304476, "learning_rate": 3.398989898989899e-05, "loss": 0.0001, "step": 855 }, { "epoch": 19.545454545454547, "grad_norm": 0.0021769488230347633, "learning_rate": 3.3863636363636364e-05, "loss": 0.0001, "step": 860 }, { "epoch": 19.65909090909091, "grad_norm": 0.002502184361219406, "learning_rate": 3.373737373737374e-05, "loss": 0.0001, "step": 865 }, { "epoch": 19.772727272727273, "grad_norm": 0.0011145217576995492, "learning_rate": 3.3611111111111116e-05, "loss": 0.0001, "step": 870 }, { "epoch": 19.886363636363637, "grad_norm": 0.0014179410645738244, "learning_rate": 3.348484848484848e-05, "loss": 0.0001, "step": 875 }, { "epoch": 20.0, "grad_norm": 0.0021014176309108734, "learning_rate": 3.335858585858586e-05, "loss": 0.0001, "step": 880 }, { "epoch": 20.0, "eval_accuracy": 0.84, "eval_loss": 0.8451841473579407, "eval_runtime": 17.9811, "eval_samples_per_second": 11.123, "eval_steps_per_second": 0.723, "step": 880 }, { "epoch": 20.113636363636363, "grad_norm": 0.002036330057308078, "learning_rate": 3.3232323232323234e-05, "loss": 0.0001, "step": 885 }, { "epoch": 20.227272727272727, "grad_norm": 0.003770321374759078, "learning_rate": 3.310606060606061e-05, "loss": 0.0001, "step": 890 }, { "epoch": 20.34090909090909, "grad_norm": 0.0016103875823318958, "learning_rate": 3.297979797979798e-05, "loss": 0.0001, "step": 895 }, { "epoch": 20.454545454545453, "grad_norm": 0.001122700166888535, "learning_rate": 3.285353535353535e-05, "loss": 0.0001, "step": 900 }, { "epoch": 20.568181818181817, "grad_norm": 0.0010796000715345144, "learning_rate": 3.272727272727273e-05, "loss": 0.0001, "step": 905 }, { "epoch": 20.681818181818183, "grad_norm": 0.0008875965140759945, "learning_rate": 3.2601010101010104e-05, "loss": 0.0001, "step": 910 }, { "epoch": 20.795454545454547, "grad_norm": 0.0022327785845845938, "learning_rate": 3.247474747474748e-05, "loss": 0.0001, "step": 915 }, { "epoch": 20.90909090909091, "grad_norm": 0.002081720856949687, "learning_rate": 3.234848484848485e-05, "loss": 0.0001, "step": 920 }, { "epoch": 21.0, "eval_accuracy": 0.845, "eval_loss": 0.8451855182647705, "eval_runtime": 18.2346, "eval_samples_per_second": 10.968, "eval_steps_per_second": 0.713, "step": 924 }, { "epoch": 21.022727272727273, "grad_norm": 0.0012421772116795182, "learning_rate": 3.222222222222223e-05, "loss": 0.0001, "step": 925 }, { "epoch": 21.136363636363637, "grad_norm": 0.0009116244618780911, "learning_rate": 3.2095959595959595e-05, "loss": 0.0001, "step": 930 }, { "epoch": 21.25, "grad_norm": 0.0018600281327962875, "learning_rate": 3.1969696969696974e-05, "loss": 0.0001, "step": 935 }, { "epoch": 21.363636363636363, "grad_norm": 0.0009139208123087883, "learning_rate": 3.184343434343435e-05, "loss": 0.0001, "step": 940 }, { "epoch": 21.477272727272727, "grad_norm": 0.0014584549935534596, "learning_rate": 3.171717171717172e-05, "loss": 0.0001, "step": 945 }, { "epoch": 21.59090909090909, "grad_norm": 0.0014573056250810623, "learning_rate": 3.159090909090909e-05, "loss": 0.0001, "step": 950 }, { "epoch": 21.704545454545453, "grad_norm": 0.0017861599335446954, "learning_rate": 3.146464646464647e-05, "loss": 0.0001, "step": 955 }, { "epoch": 21.818181818181817, "grad_norm": 0.0017260201275348663, "learning_rate": 3.133838383838384e-05, "loss": 0.0001, "step": 960 }, { "epoch": 21.931818181818183, "grad_norm": 0.0017356129828840494, "learning_rate": 3.121212121212122e-05, "loss": 0.0001, "step": 965 }, { "epoch": 22.0, "eval_accuracy": 0.845, "eval_loss": 0.8445044159889221, "eval_runtime": 18.0402, "eval_samples_per_second": 11.086, "eval_steps_per_second": 0.721, "step": 968 }, { "epoch": 22.045454545454547, "grad_norm": 0.0013173171319067478, "learning_rate": 3.108585858585858e-05, "loss": 0.0001, "step": 970 }, { "epoch": 22.15909090909091, "grad_norm": 0.0018760478124022484, "learning_rate": 3.095959595959596e-05, "loss": 0.0001, "step": 975 }, { "epoch": 22.272727272727273, "grad_norm": 0.0007338706054724753, "learning_rate": 3.0833333333333335e-05, "loss": 0.0, "step": 980 }, { "epoch": 22.386363636363637, "grad_norm": 0.000914990552701056, "learning_rate": 3.070707070707071e-05, "loss": 0.0, "step": 985 }, { "epoch": 22.5, "grad_norm": 0.0018791673937812448, "learning_rate": 3.058080808080808e-05, "loss": 0.0001, "step": 990 }, { "epoch": 22.613636363636363, "grad_norm": 0.0010454360162839293, "learning_rate": 3.0454545454545456e-05, "loss": 0.0001, "step": 995 }, { "epoch": 22.727272727272727, "grad_norm": 0.0019519156776368618, "learning_rate": 3.032828282828283e-05, "loss": 0.0001, "step": 1000 }, { "epoch": 22.84090909090909, "grad_norm": 0.0013823095941916108, "learning_rate": 3.0202020202020205e-05, "loss": 0.0, "step": 1005 }, { "epoch": 22.954545454545453, "grad_norm": 0.0008434047340415418, "learning_rate": 3.0075757575757578e-05, "loss": 0.0001, "step": 1010 }, { "epoch": 23.0, "eval_accuracy": 0.85, "eval_loss": 0.8445219397544861, "eval_runtime": 18.3375, "eval_samples_per_second": 10.907, "eval_steps_per_second": 0.709, "step": 1012 }, { "epoch": 23.068181818181817, "grad_norm": 0.0017377930926159024, "learning_rate": 2.994949494949495e-05, "loss": 0.0, "step": 1015 }, { "epoch": 23.181818181818183, "grad_norm": 0.001168358838185668, "learning_rate": 2.9823232323232327e-05, "loss": 0.0001, "step": 1020 }, { "epoch": 23.295454545454547, "grad_norm": 0.001191208721138537, "learning_rate": 2.96969696969697e-05, "loss": 0.0, "step": 1025 }, { "epoch": 23.40909090909091, "grad_norm": 0.0011305585503578186, "learning_rate": 2.9570707070707072e-05, "loss": 0.0001, "step": 1030 }, { "epoch": 23.522727272727273, "grad_norm": 0.0014099575346335769, "learning_rate": 2.9444444444444448e-05, "loss": 0.0001, "step": 1035 }, { "epoch": 23.636363636363637, "grad_norm": 0.001969902543351054, "learning_rate": 2.9318181818181817e-05, "loss": 0.0, "step": 1040 }, { "epoch": 23.75, "grad_norm": 0.0011912737973034382, "learning_rate": 2.9191919191919193e-05, "loss": 0.0, "step": 1045 }, { "epoch": 23.863636363636363, "grad_norm": 0.0008956310921348631, "learning_rate": 2.906565656565657e-05, "loss": 0.0, "step": 1050 }, { "epoch": 23.977272727272727, "grad_norm": 0.001731114462018013, "learning_rate": 2.893939393939394e-05, "loss": 0.0, "step": 1055 }, { "epoch": 24.0, "eval_accuracy": 0.85, "eval_loss": 0.8457598686218262, "eval_runtime": 18.3475, "eval_samples_per_second": 10.901, "eval_steps_per_second": 0.709, "step": 1056 }, { "epoch": 24.09090909090909, "grad_norm": 0.000823634210973978, "learning_rate": 2.8813131313131315e-05, "loss": 0.0, "step": 1060 }, { "epoch": 24.204545454545453, "grad_norm": 0.002404524013400078, "learning_rate": 2.868686868686869e-05, "loss": 0.0001, "step": 1065 }, { "epoch": 24.318181818181817, "grad_norm": 0.0007218755781650543, "learning_rate": 2.856060606060606e-05, "loss": 0.0, "step": 1070 }, { "epoch": 24.431818181818183, "grad_norm": 0.001373838516883552, "learning_rate": 2.8434343434343436e-05, "loss": 0.0, "step": 1075 }, { "epoch": 24.545454545454547, "grad_norm": 0.0016993086319416761, "learning_rate": 2.8308080808080812e-05, "loss": 0.0, "step": 1080 }, { "epoch": 24.65909090909091, "grad_norm": 0.0014250794192776084, "learning_rate": 2.818181818181818e-05, "loss": 0.0, "step": 1085 }, { "epoch": 24.772727272727273, "grad_norm": 0.001747175119817257, "learning_rate": 2.8055555555555557e-05, "loss": 0.0, "step": 1090 }, { "epoch": 24.886363636363637, "grad_norm": 0.001142960973083973, "learning_rate": 2.7929292929292933e-05, "loss": 0.0, "step": 1095 }, { "epoch": 25.0, "grad_norm": 0.0012541130417957902, "learning_rate": 2.7803030303030303e-05, "loss": 0.0, "step": 1100 }, { "epoch": 25.0, "eval_accuracy": 0.855, "eval_loss": 0.8466522097587585, "eval_runtime": 18.1075, "eval_samples_per_second": 11.045, "eval_steps_per_second": 0.718, "step": 1100 }, { "epoch": 25.113636363636363, "grad_norm": 0.0014029736630618572, "learning_rate": 2.767676767676768e-05, "loss": 0.0, "step": 1105 }, { "epoch": 25.227272727272727, "grad_norm": 0.002169027691707015, "learning_rate": 2.7550505050505055e-05, "loss": 0.0001, "step": 1110 }, { "epoch": 25.34090909090909, "grad_norm": 0.0018963554175570607, "learning_rate": 2.7424242424242424e-05, "loss": 0.0, "step": 1115 }, { "epoch": 25.454545454545453, "grad_norm": 0.0014427511487156153, "learning_rate": 2.72979797979798e-05, "loss": 0.0, "step": 1120 }, { "epoch": 25.568181818181817, "grad_norm": 0.0007338482537306845, "learning_rate": 2.717171717171717e-05, "loss": 0.0, "step": 1125 }, { "epoch": 25.681818181818183, "grad_norm": 0.0016011031111702323, "learning_rate": 2.7045454545454545e-05, "loss": 0.0, "step": 1130 }, { "epoch": 25.795454545454547, "grad_norm": 0.0010307264747098088, "learning_rate": 2.691919191919192e-05, "loss": 0.0, "step": 1135 }, { "epoch": 25.90909090909091, "grad_norm": 0.0012928517535328865, "learning_rate": 2.679292929292929e-05, "loss": 0.0, "step": 1140 }, { "epoch": 26.0, "eval_accuracy": 0.86, "eval_loss": 0.8471766114234924, "eval_runtime": 18.299, "eval_samples_per_second": 10.93, "eval_steps_per_second": 0.71, "step": 1144 } ], "logging_steps": 5, "max_steps": 2200, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2319732604023603e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }