sn14 / trainer_state.json
chientv's picture
sn14_task1
ecb2e80 verified
{
"best_metric": 0.9716312056737588,
"best_model_checkpoint": "./results/checkpoint-3807",
"epoch": 70.0,
"eval_steps": 500,
"global_step": 5670,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 6.570446014404297,
"learning_rate": 1.9728395061728395e-05,
"loss": 2.6389,
"step": 81
},
{
"epoch": 1.0,
"eval_accuracy": 0.475177304964539,
"eval_loss": 0.7098350524902344,
"eval_runtime": 0.3441,
"eval_samples_per_second": 819.478,
"eval_steps_per_second": 52.307,
"step": 81
},
{
"epoch": 2.0,
"grad_norm": 4.623419284820557,
"learning_rate": 1.944268077601411e-05,
"loss": 0.6477,
"step": 162
},
{
"epoch": 2.0,
"eval_accuracy": 0.48226950354609927,
"eval_loss": 0.7516428828239441,
"eval_runtime": 0.3434,
"eval_samples_per_second": 821.083,
"eval_steps_per_second": 52.41,
"step": 162
},
{
"epoch": 3.0,
"grad_norm": 10.926794052124023,
"learning_rate": 1.9156966490299824e-05,
"loss": 0.6227,
"step": 243
},
{
"epoch": 3.0,
"eval_accuracy": 0.4929078014184397,
"eval_loss": 0.8317187428474426,
"eval_runtime": 0.3439,
"eval_samples_per_second": 820.006,
"eval_steps_per_second": 52.341,
"step": 243
},
{
"epoch": 4.0,
"grad_norm": 12.648384094238281,
"learning_rate": 1.887125220458554e-05,
"loss": 0.5403,
"step": 324
},
{
"epoch": 4.0,
"eval_accuracy": 0.4929078014184397,
"eval_loss": 1.9380121231079102,
"eval_runtime": 0.3424,
"eval_samples_per_second": 823.561,
"eval_steps_per_second": 52.568,
"step": 324
},
{
"epoch": 5.0,
"grad_norm": 23.567258834838867,
"learning_rate": 1.8585537918871256e-05,
"loss": 0.5108,
"step": 405
},
{
"epoch": 5.0,
"eval_accuracy": 0.49645390070921985,
"eval_loss": 2.270359754562378,
"eval_runtime": 0.3437,
"eval_samples_per_second": 820.538,
"eval_steps_per_second": 52.375,
"step": 405
},
{
"epoch": 6.0,
"grad_norm": 3.5719075202941895,
"learning_rate": 1.830335097001764e-05,
"loss": 0.4677,
"step": 486
},
{
"epoch": 6.0,
"eval_accuracy": 0.48936170212765956,
"eval_loss": 1.6858181953430176,
"eval_runtime": 0.3432,
"eval_samples_per_second": 821.693,
"eval_steps_per_second": 52.449,
"step": 486
},
{
"epoch": 7.0,
"grad_norm": 7.08165168762207,
"learning_rate": 1.8017636684303353e-05,
"loss": 0.4798,
"step": 567
},
{
"epoch": 7.0,
"eval_accuracy": 0.49645390070921985,
"eval_loss": 1.623734712600708,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.682,
"eval_steps_per_second": 52.384,
"step": 567
},
{
"epoch": 8.0,
"grad_norm": 10.894269943237305,
"learning_rate": 1.773192239858907e-05,
"loss": 0.4817,
"step": 648
},
{
"epoch": 8.0,
"eval_accuracy": 0.5141843971631206,
"eval_loss": 1.3935478925704956,
"eval_runtime": 0.3435,
"eval_samples_per_second": 821.029,
"eval_steps_per_second": 52.406,
"step": 648
},
{
"epoch": 9.0,
"grad_norm": 7.739453315734863,
"learning_rate": 1.744620811287478e-05,
"loss": 0.4668,
"step": 729
},
{
"epoch": 9.0,
"eval_accuracy": 0.5177304964539007,
"eval_loss": 1.259345531463623,
"eval_runtime": 0.343,
"eval_samples_per_second": 822.053,
"eval_steps_per_second": 52.471,
"step": 729
},
{
"epoch": 10.0,
"grad_norm": 17.012800216674805,
"learning_rate": 1.7160493827160498e-05,
"loss": 0.4359,
"step": 810
},
{
"epoch": 10.0,
"eval_accuracy": 0.5354609929078015,
"eval_loss": 1.310729742050171,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.694,
"eval_steps_per_second": 52.385,
"step": 810
},
{
"epoch": 11.0,
"grad_norm": 1.6642764806747437,
"learning_rate": 1.687477954144621e-05,
"loss": 0.3956,
"step": 891
},
{
"epoch": 11.0,
"eval_accuracy": 0.8226950354609929,
"eval_loss": 0.43421775102615356,
"eval_runtime": 0.3435,
"eval_samples_per_second": 820.982,
"eval_steps_per_second": 52.403,
"step": 891
},
{
"epoch": 12.0,
"grad_norm": 0.3688388168811798,
"learning_rate": 1.6589065255731923e-05,
"loss": 0.2906,
"step": 972
},
{
"epoch": 12.0,
"eval_accuracy": 0.9290780141843972,
"eval_loss": 0.23947754502296448,
"eval_runtime": 0.3442,
"eval_samples_per_second": 819.25,
"eval_steps_per_second": 52.293,
"step": 972
},
{
"epoch": 13.0,
"grad_norm": 37.02349853515625,
"learning_rate": 1.630335097001764e-05,
"loss": 0.2146,
"step": 1053
},
{
"epoch": 13.0,
"eval_accuracy": 0.9397163120567376,
"eval_loss": 0.33284759521484375,
"eval_runtime": 0.3437,
"eval_samples_per_second": 820.462,
"eval_steps_per_second": 52.37,
"step": 1053
},
{
"epoch": 14.0,
"grad_norm": 0.11939908564090729,
"learning_rate": 1.601763668430335e-05,
"loss": 0.1462,
"step": 1134
},
{
"epoch": 14.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.3009294867515564,
"eval_runtime": 0.3439,
"eval_samples_per_second": 819.984,
"eval_steps_per_second": 52.339,
"step": 1134
},
{
"epoch": 15.0,
"grad_norm": 0.08733003586530685,
"learning_rate": 1.5731922398589064e-05,
"loss": 0.1062,
"step": 1215
},
{
"epoch": 15.0,
"eval_accuracy": 0.9290780141843972,
"eval_loss": 0.21407951414585114,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.805,
"eval_steps_per_second": 52.392,
"step": 1215
},
{
"epoch": 16.0,
"grad_norm": 0.21886540949344635,
"learning_rate": 1.544620811287478e-05,
"loss": 0.0813,
"step": 1296
},
{
"epoch": 16.0,
"eval_accuracy": 0.9432624113475178,
"eval_loss": 0.34917283058166504,
"eval_runtime": 0.344,
"eval_samples_per_second": 819.711,
"eval_steps_per_second": 52.322,
"step": 1296
},
{
"epoch": 17.0,
"grad_norm": 0.5847246646881104,
"learning_rate": 1.5160493827160495e-05,
"loss": 0.1027,
"step": 1377
},
{
"epoch": 17.0,
"eval_accuracy": 0.9219858156028369,
"eval_loss": 0.3432806432247162,
"eval_runtime": 0.3446,
"eval_samples_per_second": 818.425,
"eval_steps_per_second": 52.24,
"step": 1377
},
{
"epoch": 18.0,
"grad_norm": 0.6198065280914307,
"learning_rate": 1.4874779541446209e-05,
"loss": 0.0736,
"step": 1458
},
{
"epoch": 18.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.27183273434638977,
"eval_runtime": 0.3437,
"eval_samples_per_second": 820.405,
"eval_steps_per_second": 52.366,
"step": 1458
},
{
"epoch": 19.0,
"grad_norm": 0.5257266163825989,
"learning_rate": 1.4589065255731925e-05,
"loss": 0.0684,
"step": 1539
},
{
"epoch": 19.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.25684282183647156,
"eval_runtime": 0.3434,
"eval_samples_per_second": 821.157,
"eval_steps_per_second": 52.414,
"step": 1539
},
{
"epoch": 20.0,
"grad_norm": 0.0009818405378609896,
"learning_rate": 1.4303350970017638e-05,
"loss": 0.0779,
"step": 1620
},
{
"epoch": 20.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.2152564525604248,
"eval_runtime": 0.3431,
"eval_samples_per_second": 821.93,
"eval_steps_per_second": 52.464,
"step": 1620
},
{
"epoch": 21.0,
"grad_norm": 0.4532203674316406,
"learning_rate": 1.4021164021164022e-05,
"loss": 0.0745,
"step": 1701
},
{
"epoch": 21.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.1914406418800354,
"eval_runtime": 0.344,
"eval_samples_per_second": 819.813,
"eval_steps_per_second": 52.329,
"step": 1701
},
{
"epoch": 22.0,
"grad_norm": 17.428327560424805,
"learning_rate": 1.3735449735449738e-05,
"loss": 0.1106,
"step": 1782
},
{
"epoch": 22.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.2807099223136902,
"eval_runtime": 0.3441,
"eval_samples_per_second": 819.457,
"eval_steps_per_second": 52.306,
"step": 1782
},
{
"epoch": 23.0,
"grad_norm": 0.00047796443686820567,
"learning_rate": 1.344973544973545e-05,
"loss": 0.0755,
"step": 1863
},
{
"epoch": 23.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.331978976726532,
"eval_runtime": 0.3453,
"eval_samples_per_second": 816.672,
"eval_steps_per_second": 52.128,
"step": 1863
},
{
"epoch": 24.0,
"grad_norm": 1.006925106048584,
"learning_rate": 1.3164021164021166e-05,
"loss": 0.0833,
"step": 1944
},
{
"epoch": 24.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.34625303745269775,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.661,
"eval_steps_per_second": 52.383,
"step": 1944
},
{
"epoch": 25.0,
"grad_norm": 0.506279706954956,
"learning_rate": 1.288183421516755e-05,
"loss": 0.0754,
"step": 2025
},
{
"epoch": 25.0,
"eval_accuracy": 0.9432624113475178,
"eval_loss": 0.34365448355674744,
"eval_runtime": 0.3432,
"eval_samples_per_second": 821.691,
"eval_steps_per_second": 52.448,
"step": 2025
},
{
"epoch": 26.0,
"grad_norm": 0.1998976171016693,
"learning_rate": 1.2596119929453263e-05,
"loss": 0.0772,
"step": 2106
},
{
"epoch": 26.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.3350883424282074,
"eval_runtime": 0.3435,
"eval_samples_per_second": 820.852,
"eval_steps_per_second": 52.395,
"step": 2106
},
{
"epoch": 27.0,
"grad_norm": 0.19478876888751984,
"learning_rate": 1.2310405643738979e-05,
"loss": 0.076,
"step": 2187
},
{
"epoch": 27.0,
"eval_accuracy": 0.9468085106382979,
"eval_loss": 0.4145265519618988,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.483,
"eval_steps_per_second": 52.244,
"step": 2187
},
{
"epoch": 28.0,
"grad_norm": 0.27469512820243835,
"learning_rate": 1.2024691358024691e-05,
"loss": 0.0625,
"step": 2268
},
{
"epoch": 28.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.44451093673706055,
"eval_runtime": 0.3439,
"eval_samples_per_second": 819.913,
"eval_steps_per_second": 52.335,
"step": 2268
},
{
"epoch": 29.0,
"grad_norm": 26.14291000366211,
"learning_rate": 1.1738977072310408e-05,
"loss": 0.0741,
"step": 2349
},
{
"epoch": 29.0,
"eval_accuracy": 0.9468085106382979,
"eval_loss": 0.29801085591316223,
"eval_runtime": 0.3448,
"eval_samples_per_second": 817.812,
"eval_steps_per_second": 52.201,
"step": 2349
},
{
"epoch": 30.0,
"grad_norm": 0.0004499799106270075,
"learning_rate": 1.145326278659612e-05,
"loss": 0.0649,
"step": 2430
},
{
"epoch": 30.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.28359255194664,
"eval_runtime": 0.3442,
"eval_samples_per_second": 819.247,
"eval_steps_per_second": 52.292,
"step": 2430
},
{
"epoch": 31.0,
"grad_norm": 0.0018564946949481964,
"learning_rate": 1.1167548500881835e-05,
"loss": 0.0688,
"step": 2511
},
{
"epoch": 31.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.21793903410434723,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.498,
"eval_steps_per_second": 52.245,
"step": 2511
},
{
"epoch": 32.0,
"grad_norm": 0.0009469461510889232,
"learning_rate": 1.088183421516755e-05,
"loss": 0.0735,
"step": 2592
},
{
"epoch": 32.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.22946923971176147,
"eval_runtime": 0.3449,
"eval_samples_per_second": 817.666,
"eval_steps_per_second": 52.191,
"step": 2592
},
{
"epoch": 33.0,
"grad_norm": 0.4778638184070587,
"learning_rate": 1.0596119929453263e-05,
"loss": 0.0648,
"step": 2673
},
{
"epoch": 33.0,
"eval_accuracy": 0.9468085106382979,
"eval_loss": 0.42410480976104736,
"eval_runtime": 0.3433,
"eval_samples_per_second": 821.406,
"eval_steps_per_second": 52.43,
"step": 2673
},
{
"epoch": 34.0,
"grad_norm": 0.21737487614154816,
"learning_rate": 1.031040564373898e-05,
"loss": 0.0672,
"step": 2754
},
{
"epoch": 34.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.2829430401325226,
"eval_runtime": 0.3447,
"eval_samples_per_second": 818.124,
"eval_steps_per_second": 52.221,
"step": 2754
},
{
"epoch": 35.0,
"grad_norm": 0.08269879966974258,
"learning_rate": 1.0024691358024692e-05,
"loss": 0.067,
"step": 2835
},
{
"epoch": 35.0,
"eval_accuracy": 0.9468085106382979,
"eval_loss": 0.3723122179508209,
"eval_runtime": 0.3448,
"eval_samples_per_second": 817.778,
"eval_steps_per_second": 52.199,
"step": 2835
},
{
"epoch": 36.0,
"grad_norm": 0.3665499687194824,
"learning_rate": 9.738977072310406e-06,
"loss": 0.0768,
"step": 2916
},
{
"epoch": 36.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.25441667437553406,
"eval_runtime": 0.3447,
"eval_samples_per_second": 818.182,
"eval_steps_per_second": 52.224,
"step": 2916
},
{
"epoch": 37.0,
"grad_norm": 0.11919476091861725,
"learning_rate": 9.45326278659612e-06,
"loss": 0.0691,
"step": 2997
},
{
"epoch": 37.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.20481815934181213,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.558,
"eval_steps_per_second": 52.248,
"step": 2997
},
{
"epoch": 38.0,
"grad_norm": 0.0036801116075366735,
"learning_rate": 9.167548500881835e-06,
"loss": 0.0661,
"step": 3078
},
{
"epoch": 38.0,
"eval_accuracy": 0.9680851063829787,
"eval_loss": 0.20478524267673492,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.468,
"eval_steps_per_second": 52.243,
"step": 3078
},
{
"epoch": 39.0,
"grad_norm": 0.12663815915584564,
"learning_rate": 8.88183421516755e-06,
"loss": 0.0409,
"step": 3159
},
{
"epoch": 39.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.18502239882946014,
"eval_runtime": 0.3434,
"eval_samples_per_second": 821.144,
"eval_steps_per_second": 52.413,
"step": 3159
},
{
"epoch": 40.0,
"grad_norm": 0.06950168311595917,
"learning_rate": 8.596119929453264e-06,
"loss": 0.0424,
"step": 3240
},
{
"epoch": 40.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.20747074484825134,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.693,
"eval_steps_per_second": 52.257,
"step": 3240
},
{
"epoch": 41.0,
"grad_norm": 0.09251494705677032,
"learning_rate": 8.310405643738978e-06,
"loss": 0.0381,
"step": 3321
},
{
"epoch": 41.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.2633875906467438,
"eval_runtime": 0.3468,
"eval_samples_per_second": 813.14,
"eval_steps_per_second": 51.903,
"step": 3321
},
{
"epoch": 42.0,
"grad_norm": 0.06917154043912888,
"learning_rate": 8.024691358024692e-06,
"loss": 0.0383,
"step": 3402
},
{
"epoch": 42.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.3520617187023163,
"eval_runtime": 0.3447,
"eval_samples_per_second": 818.036,
"eval_steps_per_second": 52.215,
"step": 3402
},
{
"epoch": 43.0,
"grad_norm": 0.0010325413895770907,
"learning_rate": 7.738977072310407e-06,
"loss": 0.0288,
"step": 3483
},
{
"epoch": 43.0,
"eval_accuracy": 0.9680851063829787,
"eval_loss": 0.2726523280143738,
"eval_runtime": 0.3428,
"eval_samples_per_second": 822.588,
"eval_steps_per_second": 52.506,
"step": 3483
},
{
"epoch": 44.0,
"grad_norm": 0.04726780578494072,
"learning_rate": 7.45326278659612e-06,
"loss": 0.035,
"step": 3564
},
{
"epoch": 44.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.2995310127735138,
"eval_runtime": 0.3442,
"eval_samples_per_second": 819.308,
"eval_steps_per_second": 52.296,
"step": 3564
},
{
"epoch": 45.0,
"grad_norm": 0.09283600747585297,
"learning_rate": 7.167548500881835e-06,
"loss": 0.0265,
"step": 3645
},
{
"epoch": 45.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.33694958686828613,
"eval_runtime": 0.3443,
"eval_samples_per_second": 818.994,
"eval_steps_per_second": 52.276,
"step": 3645
},
{
"epoch": 46.0,
"grad_norm": 0.03685113787651062,
"learning_rate": 6.881834215167549e-06,
"loss": 0.0217,
"step": 3726
},
{
"epoch": 46.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.35722091794013977,
"eval_runtime": 0.3438,
"eval_samples_per_second": 820.281,
"eval_steps_per_second": 52.358,
"step": 3726
},
{
"epoch": 47.0,
"grad_norm": 0.04708189144730568,
"learning_rate": 6.596119929453263e-06,
"loss": 0.0259,
"step": 3807
},
{
"epoch": 47.0,
"eval_accuracy": 0.9716312056737588,
"eval_loss": 0.21833930909633636,
"eval_runtime": 0.3427,
"eval_samples_per_second": 822.913,
"eval_steps_per_second": 52.526,
"step": 3807
},
{
"epoch": 48.0,
"grad_norm": 0.06329997628927231,
"learning_rate": 6.310405643738977e-06,
"loss": 0.0264,
"step": 3888
},
{
"epoch": 48.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.2745024561882019,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.777,
"eval_steps_per_second": 52.39,
"step": 3888
},
{
"epoch": 49.0,
"grad_norm": 0.13020673394203186,
"learning_rate": 6.024691358024692e-06,
"loss": 0.027,
"step": 3969
},
{
"epoch": 49.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.3425739109516144,
"eval_runtime": 0.3449,
"eval_samples_per_second": 817.548,
"eval_steps_per_second": 52.184,
"step": 3969
},
{
"epoch": 50.0,
"grad_norm": 0.04181819409132004,
"learning_rate": 5.7389770723104065e-06,
"loss": 0.023,
"step": 4050
},
{
"epoch": 50.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.37068530917167664,
"eval_runtime": 0.3441,
"eval_samples_per_second": 819.471,
"eval_steps_per_second": 52.307,
"step": 4050
},
{
"epoch": 51.0,
"grad_norm": 0.03754027560353279,
"learning_rate": 5.453262786596121e-06,
"loss": 0.0241,
"step": 4131
},
{
"epoch": 51.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.3041815459728241,
"eval_runtime": 0.3443,
"eval_samples_per_second": 819.127,
"eval_steps_per_second": 52.285,
"step": 4131
},
{
"epoch": 52.0,
"grad_norm": 0.06724414229393005,
"learning_rate": 5.167548500881835e-06,
"loss": 0.0248,
"step": 4212
},
{
"epoch": 52.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.3282240927219391,
"eval_runtime": 0.3433,
"eval_samples_per_second": 821.512,
"eval_steps_per_second": 52.437,
"step": 4212
},
{
"epoch": 53.0,
"grad_norm": 0.044111430644989014,
"learning_rate": 4.881834215167549e-06,
"loss": 0.0267,
"step": 4293
},
{
"epoch": 53.0,
"eval_accuracy": 0.9680851063829787,
"eval_loss": 0.2480100840330124,
"eval_runtime": 0.3438,
"eval_samples_per_second": 820.176,
"eval_steps_per_second": 52.352,
"step": 4293
},
{
"epoch": 54.0,
"grad_norm": 0.09385800361633301,
"learning_rate": 4.596119929453263e-06,
"loss": 0.019,
"step": 4374
},
{
"epoch": 54.0,
"eval_accuracy": 0.9680851063829787,
"eval_loss": 0.2954387366771698,
"eval_runtime": 0.3444,
"eval_samples_per_second": 818.748,
"eval_steps_per_second": 52.261,
"step": 4374
},
{
"epoch": 55.0,
"grad_norm": 0.00036285247188061476,
"learning_rate": 4.3104056437389775e-06,
"loss": 0.0233,
"step": 4455
},
{
"epoch": 55.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.26300373673439026,
"eval_runtime": 0.3483,
"eval_samples_per_second": 809.563,
"eval_steps_per_second": 51.674,
"step": 4455
},
{
"epoch": 56.0,
"grad_norm": 0.03549063578248024,
"learning_rate": 4.024691358024692e-06,
"loss": 0.0231,
"step": 4536
},
{
"epoch": 56.0,
"eval_accuracy": 0.9645390070921985,
"eval_loss": 0.26614007353782654,
"eval_runtime": 0.3434,
"eval_samples_per_second": 821.294,
"eval_steps_per_second": 52.423,
"step": 4536
},
{
"epoch": 57.0,
"grad_norm": 0.0008688592351973057,
"learning_rate": 3.7389770723104058e-06,
"loss": 0.0188,
"step": 4617
},
{
"epoch": 57.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.3676702678203583,
"eval_runtime": 0.3441,
"eval_samples_per_second": 819.514,
"eval_steps_per_second": 52.309,
"step": 4617
},
{
"epoch": 58.0,
"grad_norm": 0.00031407736241817474,
"learning_rate": 3.4532627865961205e-06,
"loss": 0.0263,
"step": 4698
},
{
"epoch": 58.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.36925771832466125,
"eval_runtime": 0.348,
"eval_samples_per_second": 810.368,
"eval_steps_per_second": 51.726,
"step": 4698
},
{
"epoch": 59.0,
"grad_norm": 0.040128860622644424,
"learning_rate": 3.1675485008818345e-06,
"loss": 0.019,
"step": 4779
},
{
"epoch": 59.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.35094693303108215,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.815,
"eval_steps_per_second": 52.392,
"step": 4779
},
{
"epoch": 60.0,
"grad_norm": 0.0004439246258698404,
"learning_rate": 2.881834215167549e-06,
"loss": 0.0202,
"step": 4860
},
{
"epoch": 60.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.3040333092212677,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.559,
"eval_steps_per_second": 52.248,
"step": 4860
},
{
"epoch": 61.0,
"grad_norm": 0.07529360055923462,
"learning_rate": 2.5961199294532628e-06,
"loss": 0.0208,
"step": 4941
},
{
"epoch": 61.0,
"eval_accuracy": 0.9468085106382979,
"eval_loss": 0.5039365887641907,
"eval_runtime": 0.3439,
"eval_samples_per_second": 819.902,
"eval_steps_per_second": 52.334,
"step": 4941
},
{
"epoch": 62.0,
"grad_norm": 0.00026053638430312276,
"learning_rate": 2.310405643738977e-06,
"loss": 0.0242,
"step": 5022
},
{
"epoch": 62.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.4803861677646637,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.64,
"eval_steps_per_second": 52.254,
"step": 5022
},
{
"epoch": 63.0,
"grad_norm": 0.06742388755083084,
"learning_rate": 2.0246913580246915e-06,
"loss": 0.023,
"step": 5103
},
{
"epoch": 63.0,
"eval_accuracy": 0.9609929078014184,
"eval_loss": 0.3538144826889038,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.51,
"eval_steps_per_second": 52.245,
"step": 5103
},
{
"epoch": 64.0,
"grad_norm": 0.00042550539365038276,
"learning_rate": 1.7389770723104056e-06,
"loss": 0.0189,
"step": 5184
},
{
"epoch": 64.0,
"eval_accuracy": 0.9574468085106383,
"eval_loss": 0.37617096304893494,
"eval_runtime": 0.3442,
"eval_samples_per_second": 819.198,
"eval_steps_per_second": 52.289,
"step": 5184
},
{
"epoch": 65.0,
"grad_norm": 0.02407378889620304,
"learning_rate": 1.45326278659612e-06,
"loss": 0.0209,
"step": 5265
},
{
"epoch": 65.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.43608424067497253,
"eval_runtime": 0.3438,
"eval_samples_per_second": 820.243,
"eval_steps_per_second": 52.356,
"step": 5265
},
{
"epoch": 66.0,
"grad_norm": 0.054311446845531464,
"learning_rate": 1.1675485008818344e-06,
"loss": 0.0209,
"step": 5346
},
{
"epoch": 66.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.41794532537460327,
"eval_runtime": 0.3436,
"eval_samples_per_second": 820.791,
"eval_steps_per_second": 52.391,
"step": 5346
},
{
"epoch": 67.0,
"grad_norm": 0.04109662398695946,
"learning_rate": 8.818342151675485e-07,
"loss": 0.0198,
"step": 5427
},
{
"epoch": 67.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.3815895617008209,
"eval_runtime": 0.3443,
"eval_samples_per_second": 819.013,
"eval_steps_per_second": 52.277,
"step": 5427
},
{
"epoch": 68.0,
"grad_norm": 0.13629287481307983,
"learning_rate": 5.961199294532629e-07,
"loss": 0.0197,
"step": 5508
},
{
"epoch": 68.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.39786896109580994,
"eval_runtime": 0.3445,
"eval_samples_per_second": 818.46,
"eval_steps_per_second": 52.242,
"step": 5508
},
{
"epoch": 69.0,
"grad_norm": 0.039983708411455154,
"learning_rate": 3.104056437389771e-07,
"loss": 0.0192,
"step": 5589
},
{
"epoch": 69.0,
"eval_accuracy": 0.950354609929078,
"eval_loss": 0.411296546459198,
"eval_runtime": 0.3435,
"eval_samples_per_second": 820.901,
"eval_steps_per_second": 52.398,
"step": 5589
},
{
"epoch": 70.0,
"grad_norm": 0.00027353325276635587,
"learning_rate": 2.469135802469136e-08,
"loss": 0.0177,
"step": 5670
},
{
"epoch": 70.0,
"eval_accuracy": 0.9539007092198581,
"eval_loss": 0.40772485733032227,
"eval_runtime": 0.3437,
"eval_samples_per_second": 820.466,
"eval_steps_per_second": 52.37,
"step": 5670
}
],
"logging_steps": 100,
"max_steps": 5670,
"num_input_tokens_seen": 0,
"num_train_epochs": 70,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9735501528974304.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}